1. /*
  2. * \x20@(#)Pattern.java 1.109 04/06/28
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package java.util.regex;
  8. import java.security.AccessController;
  9. import java.security.PrivilegedAction;
  10. import java.text.CharacterIterator;
  11. import sun.text.Normalizer;
  12. import java.util.ArrayList;
  13. import java.util.HashMap;
  14. /**
  15. * A compiled representation of a regular expression.
  16. *
  17. * <p> A regular expression, specified as a string, must first be compiled into
  18. * an instance of this class. The resulting pattern can then be used to create
  19. * a {@link Matcher} object that can match arbitrary {@link
  20. * java.lang.CharSequence </code>character sequences<code>} against the regular
  21. * expression. All of the state involved in performing a match resides in the
  22. * matcher, so many matchers can share the same pattern.
  23. *
  24. * <p> A typical invocation sequence is thus
  25. *
  26. * <blockquote><pre>
  27. * Pattern p = Pattern.{@link #compile compile}("a*b");
  28. * Matcher m = p.{@link #matcher matcher}("aaaaab");
  29. * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
  30. *
  31. * <p> A {@link #matches matches} method is defined by this class as a
  32. * convenience for when a regular expression is used just once. This method
  33. * compiles an expression and matches an input sequence against it in a single
  34. * invocation. The statement
  35. *
  36. * <blockquote><pre>
  37. * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
  38. *
  39. * is equivalent to the three statements above, though for repeated matches it
  40. * is less efficient since it does not allow the compiled pattern to be reused.
  41. *
  42. * <p> Instances of this class are immutable and are safe for use by multiple
  43. * concurrent threads. Instances of the {@link Matcher} class are not safe for
  44. * such use.
  45. *
  46. *
  47. * <a name="sum">
  48. * <h4> Summary of regular-expression constructs </h4>
  49. *
  50. * <table border="0" cellpadding="1" cellspacing="0"
  51. * summary="Regular expression constructs, and what they match">
  52. *
  53. * <tr align="left">
  54. * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>
  55. * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
  56. * </tr>
  57. *
  58. * <tr><th> </th></tr>
  59. * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
  60. *
  61. * <tr><td valign="top" headers="construct characters"><i>x</i></td>
  62. * <td headers="matches">The character <i>x</i></td></tr>
  63. * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>
  64. * <td headers="matches">The backslash character</td></tr>
  65. * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>
  66. * <td headers="matches">The character with octal value <tt>0</tt><i>n</i>
  67. * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  68. * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>
  69. * <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>
  70. * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  71. * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>
  72. * <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>
  73. * (0 <tt><=</tt> <i>m</i> <tt><=</tt> 3,
  74. * 0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  75. * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>
  76. * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hh</i></td></tr>
  77. * <tr><td valign="top" headers="construct characters"><tt>\u</tt><i>hhhh</i></td>
  78. * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hhhh</i></td></tr>
  79. * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
  80. * <td headers="matches">The tab character (<tt>'\u0009'</tt>)</td></tr>
  81. * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
  82. * <td headers="matches">The newline (line feed) character (<tt>'\u000A'</tt>)</td></tr>
  83. * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>
  84. * <td headers="matches">The carriage-return character (<tt>'\u000D'</tt>)</td></tr>
  85. * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>
  86. * <td headers="matches">The form-feed character (<tt>'\u000C'</tt>)</td></tr>
  87. * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>
  88. * <td headers="matches">The alert (bell) character (<tt>'\u0007'</tt>)</td></tr>
  89. * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>
  90. * <td headers="matches">The escape character (<tt>'\u001B'</tt>)</td></tr>
  91. * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>
  92. * <td headers="matches">The control character corresponding to <i>x</i></td></tr>
  93. *
  94. * <tr><th> </th></tr>
  95. * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
  96. *
  97. * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>
  98. * <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>
  99. * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>
  100. * <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>
  101. * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>
  102. * <td headers="matches"><tt>a</tt> through <tt>z</tt>
  103. * or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>
  104. * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>
  105. * <td headers="matches"><tt>a</tt> through <tt>d</tt>,
  106. * or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>
  107. * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td>
  108. * <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr>
  109. * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td>
  110. * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
  111. * except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>
  112. * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td>
  113. * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
  114. * and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr>
  115. * <tr><th> </th></tr>
  116. *
  117. * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
  118. *
  119. * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>
  120. * <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
  121. * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>
  122. * <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
  123. * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
  124. * <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
  125. * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
  126. * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
  127. * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
  128. * <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
  129. * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
  130. * <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
  131. * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
  132. * <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
  133. *
  134. * <tr><th> </th></tr>
  135. * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
  136. *
  137. * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>
  138. * <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>
  139. * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>
  140. * <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr>
  141. * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>
  142. * <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
  143. * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>
  144. * <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr>
  145. * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>
  146. * <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>
  147. * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>
  148. * <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr>
  149. * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>
  150. * <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr>
  151. * <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>
  152. * <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> -->
  153. * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>
  154. * <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>
  155. * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>
  156. * <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>
  157. * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>
  158. * <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>
  159. * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
  160. * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</td></tr>
  161. * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
  162. * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
  163. * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
  164. * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
  165. *
  166. * <tr><th> </th></tr>
  167. * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
  168. *
  169. * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
  170. * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
  171. * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
  172. * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
  173. * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
  174. * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
  175. * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
  176. * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
  177. *
  178. * <tr><th> </th></tr>
  179. * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr>
  180. *
  181. * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
  182. * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
  183. * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
  184. * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
  185. * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
  186. * <td headers="matches">A currency symbol</td></tr>
  187. * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
  188. * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
  189. * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
  190. * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
  191. *
  192. * <tr><th> </th></tr>
  193. * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
  194. *
  195. * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
  196. * <td headers="matches">The beginning of a line</td></tr>
  197. * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
  198. * <td headers="matches">The end of a line</td></tr>
  199. * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
  200. * <td headers="matches">A word boundary</td></tr>
  201. * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
  202. * <td headers="matches">A non-word boundary</td></tr>
  203. * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
  204. * <td headers="matches">The beginning of the input</td></tr>
  205. * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>
  206. * <td headers="matches">The end of the previous match</td></tr>
  207. * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>
  208. * <td headers="matches">The end of the input but for the final
  209. * <a href="#lt">terminator</a>, if any</td></tr>
  210. * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>
  211. * <td headers="matches">The end of the input</td></tr>
  212. *
  213. * <tr><th> </th></tr>
  214. * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
  215. *
  216. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
  217. * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  218. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>
  219. * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  220. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>
  221. * <td headers="matches"><i>X</i>, one or more times</td></tr>
  222. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>
  223. * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  224. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>
  225. * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  226. * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>
  227. * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  228. *
  229. * <tr><th> </th></tr>
  230. * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
  231. *
  232. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>
  233. * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  234. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>
  235. * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  236. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>
  237. * <td headers="matches"><i>X</i>, one or more times</td></tr>
  238. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>
  239. * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  240. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>
  241. * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  242. * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>
  243. * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  244. *
  245. * <tr><th> </th></tr>
  246. * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
  247. *
  248. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>
  249. * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  250. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>
  251. * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  252. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>
  253. * <td headers="matches"><i>X</i>, one or more times</td></tr>
  254. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>
  255. * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  256. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>
  257. * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  258. * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
  259. * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  260. *
  261. * <tr><th> </th></tr>
  262. * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
  263. *
  264. * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
  265. * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
  266. * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
  267. * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
  268. * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
  269. * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
  270. *
  271. * <tr><th> </th></tr>
  272. * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
  273. *
  274. * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
  275. * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
  276. * <a href="#cg">capturing group</a> matched</td></tr>
  277. *
  278. * <tr><th> </th></tr>
  279. * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
  280. *
  281. * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
  282. * <td headers="matches">Nothing, but quotes the following character</tt></td></tr>
  283. * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
  284. * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
  285. * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
  286. * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
  287. * <!-- Metachars: !$()*+.<>?[\]^{|} -->
  288. *
  289. * <tr><th> </th></tr>
  290. * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
  291. *
  292. * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
  293. * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
  294. * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
  295. * <td headers="matches">Nothing, but turns match flags on - off</td></tr>
  296. * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>  </td>
  297. * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
  298. * given flags on - off</td></tr>
  299. * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
  300. * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
  301. * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
  302. * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
  303. * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
  304. * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
  305. * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
  306. * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
  307. * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
  308. * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
  309. *
  310. * </table>
  311. *
  312. * <hr>
  313. *
  314. *
  315. * <a name="bs">
  316. * <h4> Backslashes, escapes, and quoting </h4>
  317. *
  318. * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
  319. * constructs, as defined in the table above, as well as to quote characters
  320. * that otherwise would be interpreted as unescaped constructs. Thus the
  321. * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
  322. * left brace.
  323. *
  324. * <p> It is an error to use a backslash prior to any alphabetic character that
  325. * does not denote an escaped construct; these are reserved for future
  326. * extensions to the regular-expression language. A backslash may be used
  327. * prior to a non-alphabetic character regardless of whether that character is
  328. * part of an unescaped construct.
  329. *
  330. * <p> Backslashes within string literals in Java source code are interpreted
  331. * as required by the <a
  332. * href="http://java.sun.com/docs/books/jls/second_edition/html/">Java Language
  333. * Specification</a> as either <a
  334. * href="http://java.sun.com/docs/books/jls/second_edition/html/lexical.doc.html#100850">Unicode
  335. * escapes</a> or other <a
  336. * href="http://java.sun.com/docs/books/jls/second_edition/html/lexical.doc.html#101089">character
  337. * escapes</a>. It is therefore necessary to double backslashes in string
  338. * literals that represent regular expressions to protect them from
  339. * interpretation by the Java bytecode compiler. The string literal
  340. * <tt>"\b"</tt>, for example, matches a single backspace character when
  341. * interpreted as a regular expression, while <tt>"\\b"</tt> matches a
  342. * word boundary. The string literal <tt>"\(hello\)"</tt> is illegal
  343. * and leads to a compile-time error; in order to match the string
  344. * <tt>(hello)</tt> the string literal <tt>"\\(hello\\)"</tt>
  345. * must be used.
  346. *
  347. * <a name="cc">
  348. * <h4> Character Classes </h4>
  349. *
  350. * <p> Character classes may appear within other character classes, and
  351. * may be composed by the union operator (implicit) and the intersection
  352. * operator (<tt>&&</tt>).
  353. * The union operator denotes a class that contains every character that is
  354. * in at least one of its operand classes. The intersection operator
  355. * denotes a class that contains every character that is in both of its
  356. * operand classes.
  357. *
  358. * <p> The precedence of character-class operators is as follows, from
  359. * highest to lowest:
  360. *
  361. * <blockquote><table border="0" cellpadding="1" cellspacing="0"
  362. * summary="Precedence of character class operators.">
  363. * <tr><th>1    </th>
  364. * <td>Literal escape    </td>
  365. * <td><tt>\x</tt></td></tr>
  366. * <tr><th>2    </th>
  367. * <td>Grouping</td>
  368. * <td><tt>[...]</tt></td></tr>
  369. * <tr><th>3    </th>
  370. * <td>Range</td>
  371. * <td><tt>a-z</tt></td></tr>
  372. * <tr><th>4    </th>
  373. * <td>Union</td>
  374. * <td><tt>[a-e][i-u]<tt></td></tr>
  375. * <tr><th>5    </th>
  376. * <td>Intersection</td>
  377. * <td><tt>[a-z&&[aeiou]]</tt></td></tr>
  378. * </table></blockquote>
  379. *
  380. * <p> Note that a different set of metacharacters are in effect inside
  381. * a character class than outside a character class. For instance, the
  382. * regular expression <tt>.</tt> loses its special meaning inside a
  383. * character class, while the expression <tt>-</tt> becomes a range
  384. * forming metacharacter.
  385. *
  386. * <a name="lt">
  387. * <h4> Line terminators </h4>
  388. *
  389. * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
  390. * the end of a line of the input character sequence. The following are
  391. * recognized as line terminators:
  392. *
  393. * <ul>
  394. *
  395. * <li> A newline (line feed) character (<tt>'\n'</tt>),
  396. *
  397. * <li> A carriage-return character followed immediately by a newline
  398. * character (<tt>"\r\n"</tt>),
  399. *
  400. * <li> A standalone carriage-return character (<tt>'\r'</tt>),
  401. *
  402. * <li> A next-line character (<tt>'\u0085'</tt>),
  403. *
  404. * <li> A line-separator character (<tt>'\u2028'</tt>), or
  405. *
  406. * <li> A paragraph-separator character (<tt>'\u2029</tt>).
  407. *
  408. * </ul>
  409. * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
  410. * recognized are newline characters.
  411. *
  412. * <p> The regular expression <tt>.</tt> matches any character except a line
  413. * terminator unless the {@link #DOTALL} flag is specified.
  414. *
  415. * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
  416. * line terminators and only match at the beginning and the end, respectively,
  417. * of the entire input sequence. If {@link #MULTILINE} mode is activated then
  418. * <tt>^</tt> matches at the beginning of input and after any line terminator
  419. * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
  420. * matches just before a line terminator or the end of the input sequence.
  421. *
  422. * <a name="cg">
  423. * <h4> Groups and capturing </h4>
  424. *
  425. * <p> Capturing groups are numbered by counting their opening parentheses from
  426. * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
  427. * are four such groups: </p>
  428. *
  429. * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
  430. * <tr><th>1    </th>
  431. * <td><tt>((A)(B(C)))</tt></td></tr>
  432. * <tr><th>2    </th>
  433. * <td><tt>(A)</tt></td></tr>
  434. * <tr><th>3    </th>
  435. * <td><tt>(B(C))</tt></td></tr>
  436. * <tr><th>4    </th>
  437. * <td><tt>(C)</tt></td></tr>
  438. * </table></blockquote>
  439. *
  440. * <p> Group zero always stands for the entire expression.
  441. *
  442. * <p> Capturing groups are so named because, during a match, each subsequence
  443. * of the input sequence that matches such a group is saved. The captured
  444. * subsequence may be used later in the expression, via a back reference, and
  445. * may also be retrieved from the matcher once the match operation is complete.
  446. *
  447. * <p> The captured input associated with a group is always the subsequence
  448. * that the group most recently matched. If a group is evaluated a second time
  449. * because of quantification then its previously-captured value, if any, will
  450. * be retained if the second evaluation fails. Matching the string
  451. * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
  452. * group two set to <tt>"b"</tt>. All captured input is discarded at the
  453. * beginning of each match.
  454. *
  455. * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
  456. * that do not capture text and do not count towards the group total.
  457. *
  458. *
  459. * <h4> Unicode support </h4>
  460. *
  461. * <p> This class is in conformance with Level 1 of <a
  462. * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
  463. * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
  464. * Canonical Equivalents.
  465. *
  466. * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
  467. * are processed as described in <a
  468. * href="http://java.sun.com/docs/books/jls/second_edition/html/lexical.doc.html#100850">\u00A73.3</a>
  469. * of the Java Language Specification. Such escape sequences are also
  470. * implemented directly by the regular-expression parser so that Unicode
  471. * escapes can be used in expressions that are read from files or from the
  472. * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
  473. * while not equal, compile into the same pattern, which matches the character
  474. * with hexadecimal value <tt>0x2014</tt>.
  475. *
  476. * <a name="ubc"> <p>Unicode blocks and categories are written with the
  477. * <tt>\p</tt> and <tt>\P</tt> constructs as in
  478. * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
  479. * property <i>prop</i>, while \P{</tt><i>prop</i><tt>}</tt> does not match if
  480. * the input has that property. Blocks are specified with the prefix
  481. * <tt>In</tt>, as in <tt>InMongolian</tt>. Categories may be specified with
  482. * the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt>
  483. * denote the category of Unicode letters. Blocks and categories can be used
  484. * both inside and outside of a character class.
  485. *
  486. * <p> The supported categories are those of
  487. * <a href="http://www.unicode.org/unicode/standard/standard.html">
  488. * <i>The Unicode Standard</i></a> in the version specified by the
  489. * {@link java.lang.Character Character} class. The category names are those
  490. * defined in the Standard, both normative and informative.
  491. * The block names supported by <code>Pattern</code> are the valid block names
  492. * accepted and defined by
  493. * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
  494. *
  495. * <a name="jcc"> <p>Categories that behave like the java.lang.Character
  496. * boolean is<i>methodname</i> methods (except for the deprecated ones) are
  497. * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
  498. * the specified property has the name <tt>java<i>methodname</i></tt>.
  499. *
  500. * <h4> Comparison to Perl 5 </h4>
  501. *
  502. * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
  503. * with ordered alternation as occurs in Perl 5.
  504. *
  505. * <p> Perl constructs not supported by this class: </p>
  506. *
  507. * <ul>
  508. *
  509. * <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
  510. * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
  511. * </p></li>
  512. *
  513. * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
  514. * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
  515. *
  516. * <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
  517. *
  518. * <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
  519. * <tt>\L</tt>, and <tt>\U</tt>. </p></li>
  520. *
  521. * </ul>
  522. *
  523. * <p> Constructs supported by this class but not by Perl: </p>
  524. *
  525. * <ul>
  526. *
  527. * <li><p> Possessive quantifiers, which greedily match as much as they can
  528. * and do not back off, even when doing so would allow the overall match to
  529. * succeed. </p></li>
  530. *
  531. * <li><p> Character-class union and intersection as described
  532. * <a href="#cc">above</a>.</p></li>
  533. *
  534. * </ul>
  535. *
  536. * <p> Notable differences from Perl: </p>
  537. *
  538. * <ul>
  539. *
  540. * <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
  541. * as back references; a backslash-escaped number greater than <tt>9</tt> is
  542. * treated as a back reference if at least that many subexpressions exist,
  543. * otherwise it is interpreted, if possible, as an octal escape. In this
  544. * class octal escapes must always begin with a zero. In this class,
  545. * <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
  546. * references, and a larger number is accepted as a back reference if at
  547. * least that many subexpressions exist at that point in the regular
  548. * expression, otherwise the parser will drop digits until the number is
  549. * smaller or equal to the existing number of groups or it is one digit.
  550. * </p></li>
  551. *
  552. * <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
  553. * where the last match left off. This functionality is provided implicitly
  554. * by the {@link Matcher} class: Repeated invocations of the {@link
  555. * Matcher#find find} method will resume where the last match left off,
  556. * unless the matcher is reset. </p></li>
  557. *
  558. * <li><p> In Perl, embedded flags at the top level of an expression affect
  559. * the whole expression. In this class, embedded flags always take effect
  560. * at the point at which they appear, whether they are at the top level or
  561. * within a group; in the latter case, flags are restored at the end of the
  562. * group just as in Perl. </p></li>
  563. *
  564. * <li><p> Perl is forgiving about malformed matching constructs, as in the
  565. * expression <tt>*a</tt>, as well as dangling brackets, as in the
  566. * expression <tt>abc]</tt>, and treats them as literals. This
  567. * class also accepts dangling brackets but is strict about dangling
  568. * metacharacters like +, ? and *, and will throw a
  569. * {@link PatternSyntaxException} if it encounters them. </p></li>
  570. *
  571. * </ul>
  572. *
  573. *
  574. * <p> For a more precise description of the behavior of regular expression
  575. * constructs, please see <a href="http://www.oreilly.com/catalog/regex2/">
  576. * <i>Mastering Regular Expressions, 2nd Edition</i>, Jeffrey E. F. Friedl,
  577. * O'Reilly and Associates, 2002.</a>
  578. * </p>
  579. *
  580. * @see java.lang.String#split(String, int)
  581. * @see java.lang.String#split(String)
  582. *
  583. * @author Mike McCloskey
  584. * @author Mark Reinhold
  585. * @author JSR-51 Expert Group
  586. * @version 1.109, 04/06/28
  587. * @since 1.4
  588. * @spec JSR-51
  589. */
  590. public final class Pattern
  591. implements java.io.Serializable
  592. {
  593. /**
  594. * Regular expression modifier values. Instead of being passed as
  595. * arguments, they can also be passed as inline modifiers.
  596. * For example, the following statements have the same effect.
  597. * <pre>
  598. * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
  599. * RegExp r2 = RegExp.compile("(?im)abc", 0);
  600. * </pre>
  601. *
  602. * The flags are duplicated so that the familiar Perl match flag
  603. * names are available.
  604. */
  605. /**
  606. * Enables Unix lines mode.
  607. *
  608. * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
  609. * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
  610. *
  611. * <p> Unix lines mode can also be enabled via the embedded flag
  612. * expression <tt>(?d)</tt>.
  613. */
  614. public static final int UNIX_LINES = 0x01;
  615. /**
  616. * Enables case-insensitive matching.
  617. *
  618. * <p> By default, case-insensitive matching assumes that only characters
  619. * in the US-ASCII charset are being matched. Unicode-aware
  620. * case-insensitive matching can be enabled by specifying the {@link
  621. * #UNICODE_CASE} flag in conjunction with this flag.
  622. *
  623. * <p> Case-insensitive matching can also be enabled via the embedded flag
  624. * expression <tt>(?i)</tt>.
  625. *
  626. * <p> Specifying this flag may impose a slight performance penalty. </p>
  627. */
  628. public static final int CASE_INSENSITIVE = 0x02;
  629. /**
  630. * Permits whitespace and comments in pattern.
  631. *
  632. * <p> In this mode, whitespace is ignored, and embedded comments starting
  633. * with <tt>#</tt> are ignored until the end of a line.
  634. *
  635. * <p> Comments mode can also be enabled via the embedded flag
  636. * expression <tt>(?x)</tt>.
  637. */
  638. public static final int COMMENTS = 0x04;
  639. /**
  640. * Enables multiline mode.
  641. *
  642. * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
  643. * just after or just before, respectively, a line terminator or the end of
  644. * the input sequence. By default these expressions only match at the
  645. * beginning and the end of the entire input sequence.
  646. *
  647. * <p> Multiline mode can also be enabled via the embedded flag
  648. * expression <tt>(?m)</tt>. </p>
  649. */
  650. public static final int MULTILINE = 0x08;
  651. /**
  652. * Enables literal parsing of the pattern.
  653. *
  654. * <p> When this flag is specified then the input string that specifies
  655. * the pattern is treated as a sequence of literal characters.
  656. * Metacharacters or escape sequences in the input sequence will be
  657. * given no special meaning.
  658. *
  659. * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
  660. * matching when used in conjunction with this flag. The other flags
  661. * become superfluous.
  662. *
  663. * <p> There is no embedded flag character for enabling literal parsing.
  664. */
  665. public static final int LITERAL = 0x10;
  666. /**
  667. * Enables dotall mode.
  668. *
  669. * <p> In dotall mode, the expression <tt>.</tt> matches any character,
  670. * including a line terminator. By default this expression does not match
  671. * line terminators.
  672. *
  673. * <p> Dotall mode can also be enabled via the embedded flag
  674. * expression <tt>(?s)</tt>. (The <tt>s</tt> is a mnemonic for
  675. * "single-line" mode, which is what this is called in Perl.) </p>
  676. */
  677. public static final int DOTALL = 0x20;
  678. /**
  679. * Enables Unicode-aware case folding.
  680. *
  681. * <p> When this flag is specified then case-insensitive matching, when
  682. * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
  683. * consistent with the Unicode Standard. By default, case-insensitive
  684. * matching assumes that only characters in the US-ASCII charset are being
  685. * matched.
  686. *
  687. * <p> Unicode-aware case folding can also be enabled via the embedded flag
  688. * expression <tt>(?u)</tt>.
  689. *
  690. * <p> Specifying this flag may impose a performance penalty. </p>
  691. */
  692. public static final int UNICODE_CASE = 0x40;
  693. /**
  694. * Enables canonical equivalence.
  695. *
  696. * <p> When this flag is specified then two characters will be considered
  697. * to match if, and only if, their full canonical decompositions match.
  698. * The expression <tt>"a\u030A"</tt>, for example, will match the
  699. * string <tt>"\u00E5"</tt> when this flag is specified. By default,
  700. * matching does not take canonical equivalence into account.
  701. *
  702. * <p> There is no embedded flag character for enabling canonical
  703. * equivalence.
  704. *
  705. * <p> Specifying this flag may impose a performance penalty. </p>
  706. */
  707. public static final int CANON_EQ = 0x80;
  708. /* Pattern has only two serialized components: The pattern string
  709. * and the flags, which are all that is needed to recompile the pattern
  710. * when it is deserialized.
  711. */
  712. /** use serialVersionUID from Merlin b59 for interoperability */
  713. private static final long serialVersionUID = 5073258162644648461L;
  714. /**
  715. * The original regular-expression pattern string.
  716. *
  717. * @serial
  718. */
  719. private String pattern;
  720. /**
  721. * The original pattern flags.
  722. *
  723. * @serial
  724. */
  725. private int flags;
  726. /**
  727. * Boolean indicating this Pattern is compiled; this is necessary in order
  728. * to lazily compile deserialized Patterns.
  729. */
  730. private transient volatile boolean compiled = false;
  731. /**
  732. * The normalized pattern string.
  733. */
  734. private transient String normalizedPattern;
  735. /**
  736. * The starting point of state machine for the find operation. This allows
  737. * a match to start anywhere in the input.
  738. */
  739. transient Node root;
  740. /**
  741. * The root of object tree for a match operation. The pattern is matched
  742. * at the beginning. This may include a find that uses BnM or a First
  743. * node.
  744. */
  745. transient Node matchRoot;
  746. /**
  747. * Temporary storage used by parsing pattern slice.
  748. */
  749. transient int[] buffer;
  750. /**
  751. * Temporary storage used while parsing group references.
  752. */
  753. transient GroupHead[] groupNodes;
  754. /**
  755. * Temporary null terminating char array used by pattern compiling.
  756. */
  757. private transient int[] temp;
  758. /**
  759. * The number of capturing groups in this Pattern. Used by matchers to
  760. * allocate storage needed to perform a match.
  761. */
  762. transient int capturingGroupCount;
  763. /**
  764. * The local variable count used by parsing tree. Used by matchers to
  765. * allocate storage needed to perform a match.
  766. */
  767. transient int localCount;
  768. /**
  769. * Index into the pattern string that keeps track of how much has been
  770. * parsed.
  771. */
  772. private transient int cursor;
  773. /**
  774. * Holds the length of the pattern string.
  775. */
  776. private transient int patternLength;
  777. /**
  778. * Compiles the given regular expression into a pattern. </p>
  779. *
  780. * @param regex
  781. * The expression to be compiled
  782. *
  783. * @throws PatternSyntaxException
  784. * If the expression's syntax is invalid
  785. */
  786. public static Pattern compile(String regex) {
  787. return new Pattern(regex, 0);
  788. }
  789. /**
  790. * Compiles the given regular expression into a pattern with the given
  791. * flags. </p>
  792. *
  793. * @param regex
  794. * The expression to be compiled
  795. *
  796. * @param flags
  797. * Match flags, a bit mask that may include
  798. * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
  799. * {@link #UNICODE_CASE}, and {@link #CANON_EQ}
  800. *
  801. * @throws IllegalArgumentException
  802. * If bit values other than those corresponding to the defined
  803. * match flags are set in <tt>flags</tt>
  804. *
  805. * @throws PatternSyntaxException
  806. * If the expression's syntax is invalid
  807. */
  808. public static Pattern compile(String regex, int flags) {
  809. return new Pattern(regex, flags);
  810. }
  811. /**
  812. * Returns the regular expression from which this pattern was compiled.
  813. * </p>
  814. *
  815. * @return The source of this pattern
  816. */
  817. public String pattern() {
  818. return pattern;
  819. }
  820. /**
  821. * <p>Returns the string representation of this pattern. This
  822. * is the regular expression from which this pattern was
  823. * compiled.</p>
  824. *
  825. * @return The string representation of this pattern
  826. * @since 1.5
  827. */
  828. public String toString() {
  829. return pattern;
  830. }
  831. /**
  832. * Creates a matcher that will match the given input against this pattern.
  833. * </p>
  834. *
  835. * @param input
  836. * The character sequence to be matched
  837. *
  838. * @return A new matcher for this pattern
  839. */
  840. public Matcher matcher(CharSequence input) {
  841. synchronized(this) {
  842. if (!compiled)
  843. compile();
  844. }
  845. Matcher m = new Matcher(this, input);
  846. return m;
  847. }
  848. /**
  849. * Returns this pattern's match flags. </p>
  850. *
  851. * @return The match flags specified when this pattern was compiled
  852. */
  853. public int flags() {
  854. return flags;
  855. }
  856. /**
  857. * Compiles the given regular expression and attempts to match the given
  858. * input against it.
  859. *
  860. * <p> An invocation of this convenience method of the form
  861. *
  862. * <blockquote><pre>
  863. * Pattern.matches(regex, input);</pre></blockquote>
  864. *
  865. * behaves in exactly the same way as the expression
  866. *
  867. * <blockquote><pre>
  868. * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
  869. *
  870. * <p> If a pattern is to be used multiple times, compiling it once and reusing
  871. * it will be more efficient than invoking this method each time. </p>
  872. *
  873. * @param regex
  874. * The expression to be compiled
  875. *
  876. * @param input
  877. * The character sequence to be matched
  878. *
  879. * @throws PatternSyntaxException
  880. * If the expression's syntax is invalid
  881. */
  882. public static boolean matches(String regex, CharSequence input) {
  883. Pattern p = Pattern.compile(regex);
  884. Matcher m = p.matcher(input);
  885. return m.matches();
  886. }
  887. /**
  888. * Splits the given input sequence around matches of this pattern.
  889. *
  890. * <p> The array returned by this method contains each substring of the
  891. * input sequence that is terminated by another subsequence that matches
  892. * this pattern or is terminated by the end of the input sequence. The
  893. * substrings in the array are in the order in which they occur in the
  894. * input. If this pattern does not match any subsequence of the input then
  895. * the resulting array has just one element, namely the input sequence in
  896. * string form.
  897. *
  898. * <p> The <tt>limit</tt> parameter controls the number of times the
  899. * pattern is applied and therefore affects the length of the resulting
  900. * array. If the limit <i>n</i> is greater than zero then the pattern
  901. * will be applied at most <i>n</i> - 1 times, the array's
  902. * length will be no greater than <i>n</i>, and the array's last entry
  903. * will contain all input beyond the last matched delimiter. If <i>n</i>
  904. * is non-positive then the pattern will be applied as many times as
  905. * possible and the array can have any length. If <i>n</i> is zero then
  906. * the pattern will be applied as many times as possible, the array can
  907. * have any length, and trailing empty strings will be discarded.
  908. *
  909. * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  910. * results with these parameters:
  911. *
  912. * <blockquote><table cellpadding=1 cellspacing=0
  913. * summary="Split examples showing regex, limit, and result">
  914. * <tr><th><P align="left"><i>Regex    </i></th>
  915. * <th><P align="left"><i>Limit    </i></th>
  916. * <th><P align="left"><i>Result    </i></th></tr>
  917. * <tr><td align=center>:</td>
  918. * <td align=center>2</td>
  919. * <td><tt>{ "boo", "and:foo" }</tt></td></tr>
  920. * <tr><td align=center>:</td>
  921. * <td align=center>5</td>
  922. * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  923. * <tr><td align=center>:</td>
  924. * <td align=center>-2</td>
  925. * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  926. * <tr><td align=center>o</td>
  927. * <td align=center>5</td>
  928. * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  929. * <tr><td align=center>o</td>
  930. * <td align=center>-2</td>
  931. * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  932. * <tr><td align=center>o</td>
  933. * <td align=center>0</td>
  934. * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  935. * </table></blockquote>
  936. *
  937. *
  938. * @param input
  939. * The character sequence to be split
  940. *
  941. * @param limit
  942. * The result threshold, as described above
  943. *
  944. * @return The array of strings computed by splitting the input
  945. * around matches of this pattern
  946. */
  947. public String[] split(CharSequence input, int limit) {
  948. int index = 0;
  949. boolean matchLimited = limit > 0;
  950. ArrayList matchList = new ArrayList();
  951. Matcher m = matcher(input);
  952. // Add segments before each match found
  953. while(m.find()) {
  954. if (!matchLimited || matchList.size() < limit - 1) {
  955. String match = input.subSequence(index, m.start()).toString();
  956. matchList.add(match);
  957. index = m.end();
  958. } else if (matchList.size() == limit - 1) { // last one
  959. String match = input.subSequence(index,
  960. input.length()).toString();
  961. matchList.add(match);
  962. index = m.end();
  963. }
  964. }
  965. // If no match was found, return this
  966. if (index == 0)
  967. return new String[] {input.toString()};
  968. // Add remaining segment
  969. if (!matchLimited || matchList.size() < limit)
  970. matchList.add(input.subSequence(index, input.length()).toString());
  971. // Construct result
  972. int resultSize = matchList.size();
  973. if (limit == 0)
  974. while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
  975. resultSize--;
  976. String[] result = new String[resultSize];
  977. return (String[])matchList.subList(0, resultSize).toArray(result);
  978. }
  979. /**
  980. * Splits the given input sequence around matches of this pattern.
  981. *
  982. * <p> This method works as if by invoking the two-argument {@link
  983. * #split(java.lang.CharSequence, int) split} method with the given input
  984. * sequence and a limit argument of zero. Trailing empty strings are
  985. * therefore not included in the resulting array. </p>
  986. *
  987. * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  988. * results with these expressions:
  989. *
  990. * <blockquote><table cellpadding=1 cellspacing=0
  991. * summary="Split examples showing regex and result">
  992. * <tr><th><P align="left"><i>Regex    </i></th>
  993. * <th><P align="left"><i>Result</i></th></tr>
  994. * <tr><td align=center>:</td>
  995. * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  996. * <tr><td align=center>o</td>
  997. * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  998. * </table></blockquote>
  999. *
  1000. *
  1001. * @param input
  1002. * The character sequence to be split
  1003. *
  1004. * @return The array of strings computed by splitting the input
  1005. * around matches of this pattern
  1006. */
  1007. public String[] split(CharSequence input) {
  1008. return split(input, 0);
  1009. }
  1010. /**
  1011. * Returns a literal pattern <code>String</code> for the specified
  1012. * <code>String</code>.
  1013. *
  1014. * <p>This method produces a <code>String</code> that can be used to
  1015. * create a <code>Pattern</code> that would match the string
  1016. * <code>s</code> as if it were a literal pattern.</p> Metacharacters
  1017. * or escape sequences in the input sequence will be given no special
  1018. * meaning.
  1019. *
  1020. * @param s The string to be literalized
  1021. * @return A literal string replacement
  1022. * @since 1.5
  1023. */
  1024. public static String quote(String s) {
  1025. int slashEIndex = s.indexOf("\\E");
  1026. if (slashEIndex == -1)
  1027. return "\\Q" + s + "\\E";
  1028. StringBuilder sb = new StringBuilder(s.length() * 2);
  1029. sb.append("\\Q");
  1030. slashEIndex = 0;
  1031. int current = 0;
  1032. while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
  1033. sb.append(s.substring(current, slashEIndex));
  1034. current = slashEIndex + 2;
  1035. sb.append("\\E\\\\E\\Q");
  1036. }
  1037. sb.append(s.substring(current, s.length()));
  1038. sb.append("\\E");
  1039. return sb.toString();
  1040. }
  1041. /**
  1042. * Recompile the Pattern instance from a stream. The original pattern
  1043. * string is read in and the object tree is recompiled from it.
  1044. */
  1045. private void readObject(java.io.ObjectInputStream s)
  1046. throws java.io.IOException, ClassNotFoundException {
  1047. // Read in all fields
  1048. s.defaultReadObject();
  1049. // Initialize counts
  1050. capturingGroupCount = 1;
  1051. localCount = 0;
  1052. // if length > 0, the Pattern is lazily compiled
  1053. compiled = false;
  1054. if (pattern.length() == 0) {
  1055. root = new Start(lastAccept);
  1056. matchRoot = lastAccept;
  1057. compiled = true;
  1058. }
  1059. }
  1060. /**
  1061. * This private constructor is used to create all Patterns. The pattern
  1062. * string and match flags are all that is needed to completely describe
  1063. * a Pattern. An empty pattern string results in an object tree with
  1064. * only a Start node and a LastNode node.
  1065. */
  1066. private Pattern(String p, int f) {
  1067. pattern = p;
  1068. flags = f;
  1069. // Reset group index count
  1070. capturingGroupCount = 1;
  1071. localCount = 0;
  1072. if (pattern.length() > 0) {
  1073. compile();
  1074. } else {
  1075. root = new Start(lastAccept);
  1076. matchRoot = lastAccept;
  1077. }
  1078. }
  1079. /**
  1080. * The pattern is converted to normalizedD form and then a pure group
  1081. * is constructed to match canonical equivalences of the characters.
  1082. */
  1083. private void normalize() {
  1084. boolean inCharClass = false;
  1085. int lastCodePoint = -1;
  1086. // Convert pattern into normalizedD form
  1087. normalizedPattern = Normalizer.decompose(pattern, false, 0);
  1088. patternLength = normalizedPattern.length();
  1089. // Modify pattern to match canonical equivalences
  1090. StringBuilder newPattern = new StringBuilder(patternLength);
  1091. for(int i=0; i<patternLength; ) {
  1092. int c = normalizedPattern.codePointAt(i);
  1093. StringBuilder sequenceBuffer;
  1094. if ((Character.getType(c) == Character.NON_SPACING_MARK)
  1095. && (lastCodePoint != -1)) {
  1096. sequenceBuffer = new StringBuilder();
  1097. sequenceBuffer.appendCodePoint(lastCodePoint);
  1098. sequenceBuffer.appendCodePoint(c);
  1099. while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1100. i += Character.charCount(c);
  1101. if (i >= patternLength)
  1102. break;
  1103. c = normalizedPattern.codePointAt(i);
  1104. sequenceBuffer.appendCodePoint(c);
  1105. }
  1106. String ea = produceEquivalentAlternation(
  1107. sequenceBuffer.toString());
  1108. newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
  1109. newPattern.append("(?:").append(ea).append(")");
  1110. } else if (c == '[' && lastCodePoint != '\\') {
  1111. i = normalizeCharClass(newPattern, i);
  1112. } else {
  1113. newPattern.appendCodePoint(c);
  1114. }
  1115. lastCodePoint = c;
  1116. i += Character.charCount(c);
  1117. }
  1118. normalizedPattern = newPattern.toString();
  1119. }
  1120. /**
  1121. * Complete the character class being parsed and add a set
  1122. * of alternations to it that will match the canonical equivalences
  1123. * of the characters within the class.
  1124. */
  1125. private int normalizeCharClass(StringBuilder newPattern, int i) {
  1126. StringBuilder charClass = new StringBuilder();
  1127. StringBuilder eq = null;
  1128. int lastCodePoint = -1;
  1129. String result;
  1130. i++;
  1131. charClass.append("[");
  1132. while(true) {
  1133. int c = normalizedPattern.codePointAt(i);
  1134. StringBuilder sequenceBuffer;
  1135. if (c == ']' && lastCodePoint != '\\') {
  1136. charClass.append((char)c);
  1137. break;
  1138. } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
  1139. sequenceBuffer = new StringBuilder();
  1140. sequenceBuffer.appendCodePoint(lastCodePoint);
  1141. while(Character.getType(c) == Character.NON_SPACING_MARK) {
  1142. sequenceBuffer.appendCodePoint(c);
  1143. i += Character.charCount(c);
  1144. if (i >= normalizedPattern.length())
  1145. break;
  1146. c = normalizedPattern.codePointAt(i);
  1147. }
  1148. String ea = produceEquivalentAlternation(
  1149. sequenceBuffer.toString());
  1150. charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
  1151. if (eq == null)
  1152. eq = new StringBuilder();
  1153. eq.append('|');
  1154. eq.append(ea);
  1155. } else {
  1156. charClass.appendCodePoint(c);
  1157. i++;
  1158. }
  1159. if (i == normalizedPattern.length())
  1160. error("Unclosed character class");
  1161. lastCodePoint = c;
  1162. }
  1163. if (eq != null) {
  1164. result = new String("(?:"+charClass.toString()+
  1165. eq.toString()+")");
  1166. } else {
  1167. result = charClass.toString();
  1168. }
  1169. newPattern.append(result);
  1170. return i;
  1171. }
  1172. /**
  1173. * Given a specific sequence composed of a regular character and
  1174. * combining marks that follow it, produce the alternation that will
  1175. * match all canonical equivalences of that sequence.
  1176. */
  1177. private String produceEquivalentAlternation(String source) {
  1178. int len = countChars(source, 0, 1);
  1179. if (source.length() == len)
  1180. // source has one character.
  1181. return new String(source);
  1182. String base = source.substring(0,len);
  1183. String combiningMarks = source.substring(len);
  1184. String[] perms = producePermutations(combiningMarks);
  1185. StringBuilder result = new StringBuilder(source);
  1186. // Add combined permutations
  1187. for(int x=0; x<perms.length; x++) {
  1188. String next = base + perms[x];
  1189. if (x>0)
  1190. result.append("|"+next);
  1191. next = composeOneStep(next);
  1192. if (next != null)
  1193. result.append("|"+produceEquivalentAlternation(next));
  1194. }
  1195. return result.toString();
  1196. }
  1197. /**
  1198. * Returns an array of strings that have all the possible
  1199. * permutations of the characters in the input string.
  1200. * This is used to get a list of all possible orderings
  1201. * of a set of combining marks. Note that some of the permutations
  1202. * are invalid because of combining class collisions, and these
  1203. * possibilities must be removed because they are not canonically
  1204. * equivalent.
  1205. */
  1206. private String[] producePermutations(String input) {
  1207. if (input.length() == countChars(input, 0, 1))
  1208. return new String[] {input};
  1209. if (input.length() == countChars(input, 0, 2)) {
  1210. int c0 = Character.codePointAt(input, 0);
  1211. int c1 = Character.codePointAt(input, Character.charCount(c0));
  1212. if (getClass(c1) == getClass(c0)) {
  1213. return new String[] {input};
  1214. }
  1215. String[] result = new String[2];
  1216. result[0] = input;
  1217. StringBuilder sb = new StringBuilder(2);
  1218. sb.appendCodePoint(c1);
  1219. sb.appendCodePoint(c0);
  1220. result[1] = sb.toString();
  1221. return result;
  1222. }
  1223. int length = 1;
  1224. int nCodePoints = countCodePoints(input);
  1225. for(int x=1; x<nCodePoints; x++)
  1226. length = length * (x+1);
  1227. String[] temp = new String[length];
  1228. int combClass[] = new int[nCodePoints];
  1229. for(int x=0, i=0; x<nCodePoints; x++) {
  1230. int c = Character.codePointAt(input, i);
  1231. combClass[x] = getClass(c);
  1232. i += Character.charCount(c);
  1233. }
  1234. // For each char, take it out and add the permutations
  1235. // of the remaining chars
  1236. int index = 0;
  1237. int len;
  1238. // offset maintains the index in code units.
  1239. loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
  1240. len = countChars(input, offset, 1);
  1241. boolean skip = false;
  1242. for(int y=x-1; y>=0; y--) {
  1243. if (combClass[y] == combClass[x]) {
  1244. continue loop;
  1245. }
  1246. }
  1247. StringBuilder sb = new StringBuilder(input);
  1248. String otherChars = sb.delete(offset, offset+len).toString();
  1249. String[] subResult = producePermutations(otherChars);
  1250. String prefix = input.substring(offset, offset+len);
  1251. for(int y=0; y<subResult.length; y++)
  1252. temp[index++] = prefix + subResult[y];
  1253. }
  1254. String[] result = new String[index];
  1255. for (int x=0; x<index; x++)
  1256. result[x] = temp[x];
  1257. return result;
  1258. }
  1259. private int getClass(int c) {
  1260. return Normalizer.getClass(c);
  1261. }
  1262. /**
  1263. * Attempts to compose input by combining the first character
  1264. * with the first combining mark following it. Returns a String
  1265. * that is the composition of the leading character with its first
  1266. * combining mark followed by the remaining combining marks. Returns
  1267. * null if the first two characters cannot be further composed.
  1268. */
  1269. private String composeOneStep(String input) {
  1270. int len = countChars(input, 0, 2);
  1271. String firstTwoCharacters = input.substring(0, len);
  1272. String result = Normalizer.compose(firstTwoCharacters, false, 0);
  1273. if (result.equals(firstTwoCharacters))
  1274. return null;
  1275. else {
  1276. String remainder = input.substring(len);
  1277. return result + remainder;
  1278. }
  1279. }
  1280. /**
  1281. * Copies regular expression to a char array and invokes the parsing
  1282. * of the expression which will create the object tree.
  1283. */
  1284. private void compile() {
  1285. // Handle canonical equivalences
  1286. if (has(CANON_EQ) && !has(LITERAL)) {
  1287. normalize();
  1288. } else {
  1289. normalizedPattern = pattern;
  1290. }
  1291. // Copy pattern to char array for convenience
  1292. patternLength = normalizedPattern.length();
  1293. temp = new int[patternLength + 2];
  1294. boolean hasSupplementary = false;
  1295. // Use double null characters to terminate pattern
  1296. int c, count = 0;
  1297. // Convert all chars into code points
  1298. for (int x = 0; x < patternLength; x += Character.charCount(c)) {
  1299. c = normalizedPattern.codePointAt(x);
  1300. if (isSupplementary(c)) {
  1301. hasSupplementary = true;
  1302. }
  1303. temp[count++] = c;
  1304. }
  1305. patternLength = count; // patternLength now in code points
  1306. temp[patternLength] = 0;
  1307. temp[patternLength + 1] = 0;
  1308. // Allocate all temporary objects here.
  1309. buffer = new int[32];
  1310. groupNodes = new GroupHead[10];
  1311. if (has(LITERAL)) {
  1312. // Literal pattern handling
  1313. matchRoot = newSlice(temp, patternLength, hasSupplementary);
  1314. matchRoot.next = lastAccept;
  1315. } else {
  1316. // Start recursive decedent parsing
  1317. matchRoot = expr(lastAccept);
  1318. // Check extra pattern characters
  1319. if (patternLength != cursor) {
  1320. if (peek() == ')') {
  1321. error("Unmatched closing ')'");
  1322. } else {
  1323. error("Unexpected internal error");
  1324. }
  1325. }
  1326. }
  1327. // Peephole optimization
  1328. if (matchRoot instanceof Slice) {
  1329. root = BnM.optimize(matchRoot);
  1330. if (root == matchRoot) {
  1331. root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1332. }
  1333. } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
  1334. root = matchRoot;
  1335. } else {
  1336. root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  1337. }
  1338. // Release temporary storage
  1339. temp = null;
  1340. buffer = null;
  1341. groupNodes = null;
  1342. patternLength = 0;
  1343. compiled = true;
  1344. }
  1345. /**
  1346. * Used to print out a subtree of the Pattern to help with debugging.
  1347. */
  1348. private static void printObjectTree(Node node) {
  1349. while(node != null) {
  1350. if (node instanceof Prolog) {
  1351. System.out.println(node);
  1352. printObjectTree(((Prolog)node).loop);
  1353. System.out.println("**** end contents prolog loop");
  1354. } else if (node instanceof Loop) {
  1355. System.out.println(node);
  1356. printObjectTree(((Loop)node).body);
  1357. System.out.println("**** end contents Loop body");
  1358. } else if (node instanceof Curly) {
  1359. System.out.println(node);
  1360. printObjectTree(((Curly)node).atom);
  1361. System.out.println("**** end contents Curly body");
  1362. } else if (node instanceof GroupCurly) {
  1363. System.out.println(node);
  1364. printObjectTree(((GroupCurly)node).atom);
  1365. System.out.println("**** end contents GroupCurly body");
  1366. } else if (node instanceof GroupTail) {
  1367. System.out.println(node);
  1368. System.out.println("Tail next is "+node.next);
  1369. return;
  1370. } else {
  1371. System.out.println(node);
  1372. }
  1373. node = node.next;
  1374. if (node != null)
  1375. System.out.println("->next:");
  1376. if (node == Pattern.accept) {
  1377. System.out.println("Accept Node");
  1378. node = null;
  1379. }
  1380. }
  1381. }
  1382. /**
  1383. * Used to accumulate information about a subtree of the object graph
  1384. * so that optimizations can be applied to the subtree.
  1385. */
  1386. static final class TreeInfo {
  1387. int minLength;
  1388. int maxLength;
  1389. boolean maxValid;
  1390. boolean deterministic;
  1391. TreeInfo() {
  1392. reset();
  1393. }
  1394. void reset() {
  1395. minLength = 0;
  1396. maxLength = 0;
  1397. maxValid = true;
  1398. deterministic = true;
  1399. }
  1400. }
  1401. /**
  1402. * The following private methods are mainly used to improve the
  1403. * readability of the code. In order to let the Java compiler easily
  1404. * inline them, we should not put many assertions or error checks in them.
  1405. */
  1406. /**
  1407. * Indicates whether a particular flag is set or not.
  1408. */
  1409. private boolean has(int f) {
  1410. return (flags & f) > 0;
  1411. }
  1412. /**
  1413. * Match next character, signal error if failed.
  1414. */
  1415. private void accept(int ch, String s) {
  1416. int testChar = temp[cursor++];
  1417. if (has(COMMENTS))
  1418. testChar = parsePastWhitespace(testChar);
  1419. if (ch != testChar) {
  1420. error(s);
  1421. }
  1422. }
  1423. /**
  1424. * Mark the end of pattern with a specific character.
  1425. */
  1426. private void mark(int c) {
  1427. temp[patternLength] = c;
  1428. }
  1429. /**
  1430. * Peek the next character, and do not advance the cursor.
  1431. */
  1432. private int peek() {
  1433. int ch = temp[cursor];
  1434. if (has(COMMENTS))
  1435. ch = peekPastWhitespace(ch);
  1436. return ch;
  1437. }
  1438. /**
  1439. * Read the next character, and advance the cursor by one.
  1440. */
  1441. private int read() {
  1442. int ch = temp[cursor++];
  1443. if (has(COMMENTS))
  1444. ch = parsePastWhitespace(ch);
  1445. return ch;
  1446. }
  1447. /**
  1448. * Read the next character, and advance the cursor by one,
  1449. * ignoring the COMMENTS setting
  1450. */
  1451. private int readEscaped() {
  1452. int ch = temp[cursor++];
  1453. return ch;
  1454. }
  1455. /**
  1456. * Advance the cursor by one, and peek the next character.
  1457. */
  1458. private int next() {
  1459. int ch = temp[++cursor];
  1460. if (has(COMMENTS))
  1461. ch = peekPastWhitespace(ch);
  1462. return ch;
  1463. }
  1464. /**
  1465. * Advance the cursor by one, and peek the next character,
  1466. * ignoring the COMMENTS setting
  1467. */
  1468. private int nextEscaped() {
  1469. int ch = temp[++cursor];
  1470. return ch;
  1471. }
  1472. /**
  1473. * If in xmode peek past whitespace and comments.
  1474. */
  1475. private int peekPastWhitespace(int ch) {
  1476. while (ASCII.isSpace(ch) || ch == '#') {
  1477. while (ASCII.isSpace(ch))
  1478. ch = temp[++cursor];
  1479. if (ch == '#') {
  1480. ch = peekPastLine();
  1481. }
  1482. }
  1483. return ch;
  1484. }
  1485. /**
  1486. * If in xmode parse past whitespace and comments.
  1487. */
  1488. private int parsePastWhitespace(int ch) {
  1489. while (ASCII.isSpace(ch) || ch == '#') {
  1490. while (ASCII.isSpace(ch))
  1491. ch = temp[cursor++];
  1492. if (ch == '#')
  1493. ch = parsePastLine();
  1494. }
  1495. return ch;
  1496. }
  1497. /**
  1498. * xmode parse past comment to end of line.
  1499. */
  1500. private int parsePastLine() {
  1501. int ch = temp[cursor++];
  1502. while (ch != 0 && !isLineSeparator(ch))
  1503. ch = temp[cursor++];
  1504. return ch;
  1505. }
  1506. /**
  1507. * xmode peek past comment to end of line.
  1508. */
  1509. private int peekPastLine() {
  1510. int ch = temp[++cursor];
  1511. while (ch != 0 && !isLineSeparator(ch))
  1512. ch = temp[++cursor];
  1513. return ch;
  1514. }
  1515. /**
  1516. * Determines if character is a line separator in the current mode
  1517. */
  1518. private boolean isLineSeparator(int ch) {
  1519. if (has(UNIX_LINES)) {
  1520. return ch == '\n';
  1521. } else {
  1522. return (ch == '\n' ||
  1523. ch == '\r' ||
  1524. (ch|1) == '\u2029' ||
  1525. ch == '\u0085');
  1526. }
  1527. }
  1528. /**
  1529. * Read the character after the next one, and advance the cursor by two.
  1530. */
  1531. private int skip() {
  1532. int i = cursor;
  1533. int ch = temp[i+1];
  1534. cursor = i + 2;
  1535. return ch;
  1536. }
  1537. /**
  1538. * Unread one next character, and retreat cursor by one.
  1539. */
  1540. private void unread() {
  1541. cursor--;
  1542. }
  1543. /**
  1544. * Internal method used for handling all syntax errors. The pattern is
  1545. * displayed with a pointer to aid in locating the syntax error.
  1546. */
  1547. private Node error(String s) {
  1548. throw new PatternSyntaxException(s, normalizedPattern,
  1549. cursor - 1);
  1550. }
  1551. /**
  1552. * Determines if there is any supplementary character or unpaired
  1553. * surrogate in the specified range.
  1554. */
  1555. private boolean findSupplementary(int start, int end) {
  1556. for (int i = start; i < end; i++) {
  1557. if (isSupplementary(temp[i]))
  1558. return true;
  1559. }
  1560. return false;
  1561. }
  1562. /**
  1563. * Determines if the specified code point is a supplementary
  1564. * character or unpaired surrogate.
  1565. */
  1566. private static final boolean isSupplementary(int ch) {
  1567. return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
  1568. }
  1569. /**
  1570. * The following methods handle the main parsing. They are sorted
  1571. * according to their precedence order, the lowest one first.
  1572. */
  1573. /**
  1574. * The expression is parsed with branch nodes added for alternations.
  1575. * This may be called recursively to parse sub expressions that may
  1576. * contain alternations.
  1577. */
  1578. private Node expr(Node end) {
  1579. Node prev = null;
  1580. for (;;) {
  1581. Node node = sequence(end);
  1582. if (prev == null) {
  1583. prev = node;
  1584. } else {
  1585. prev = new Branch(prev, node);
  1586. }
  1587. if (peek() != '|') {
  1588. return prev;
  1589. }
  1590. next();
  1591. }
  1592. }
  1593. /**
  1594. * Parsing of sequences between alternations.
  1595. */
  1596. private Node sequence(Node end) {
  1597. Node head = null;
  1598. Node tail = null;
  1599. Node node = null;
  1600. int i, j, ch;
  1601. LOOP:
  1602. for (;;) {
  1603. ch = peek();
  1604. switch (ch) {
  1605. case '(':
  1606. // Because group handles its own closure,
  1607. // we need to treat it differently
  1608. node = group0();
  1609. // Check for comment or flag group
  1610. if (node == null)
  1611. continue;
  1612. if (head == null)
  1613. head = node;
  1614. else
  1615. tail.next = node;
  1616. // Double return: Tail was returned in root
  1617. tail = root;
  1618. continue;
  1619. case '[':
  1620. node = clazz(true);
  1621. break;
  1622. case '\\':
  1623. ch = nextEscaped();
  1624. if (ch == 'p' || ch == 'P') {
  1625. boolean comp = (ch == 'P');
  1626. boolean oneLetter = true;
  1627. ch = next(); // Consume { if present
  1628. if (ch != '{') {
  1629. unread();
  1630. } else {
  1631. oneLetter = false;
  1632. }
  1633. node = family(comp, oneLetter);
  1634. } else {
  1635. unread();
  1636. node = atom();
  1637. }
  1638. break;
  1639. case '^':
  1640. next();
  1641. if (has(MULTILINE)) {
  1642. if (has(UNIX_LINES))
  1643. node = new UnixCaret();
  1644. else
  1645. node = new Caret();
  1646. } else {
  1647. node = new Begin();
  1648. }
  1649. break;
  1650. case '$':
  1651. next();
  1652. if (has(UNIX_LINES))
  1653. node = new UnixDollar(has(MULTILINE));
  1654. else
  1655. node = new Dollar(has(MULTILINE));
  1656. break;
  1657. case '.':
  1658. next();
  1659. if (has(DOTALL)) {
  1660. node = new All();
  1661. } else {
  1662. if (has(UNIX_LINES))
  1663. node = new UnixDot();
  1664. else {
  1665. node = new Dot();
  1666. }
  1667. }
  1668. break;
  1669. case '|':
  1670. case ')':
  1671. break LOOP;
  1672. case ']': // Now interpreting dangling ] and } as literals
  1673. case '}':
  1674. node = atom();
  1675. break;
  1676. case '?':
  1677. case '*':
  1678. case '+':
  1679. next();
  1680. return error("Dangling meta character '" + ((char)ch) + "'");
  1681. case 0:
  1682. if (cursor >= patternLength) {
  1683. break LOOP;
  1684. }
  1685. // Fall through
  1686. default:
  1687. node = atom();
  1688. break;
  1689. }
  1690. node = closure(node);
  1691. if (head == null) {
  1692. head = tail = node;
  1693. } else {
  1694. tail.next = node;
  1695. tail = node;
  1696. }
  1697. }
  1698. if (head == null) {
  1699. return end;
  1700. }
  1701. tail.next = end;
  1702. return head;
  1703. }
  1704. /**
  1705. * Parse and add a new Single or Slice.
  1706. */
  1707. private Node atom() {
  1708. int first = 0;
  1709. int prev = -1;
  1710. boolean hasSupplementary = false;
  1711. int ch = peek();
  1712. for (;;) {
  1713. switch (ch) {
  1714. case '*':
  1715. case '+':
  1716. case '?':
  1717. case '{':
  1718. if (first > 1) {
  1719. cursor = prev; // Unwind one character
  1720. first--;
  1721. }
  1722. break;
  1723. case '$':
  1724. case '.':
  1725. case '^':
  1726. case '(':
  1727. case '[':
  1728. case '|':
  1729. case ')':
  1730. break;
  1731. case '\\':
  1732. ch = nextEscaped();
  1733. if (ch == 'p' || ch == 'P') { // Property
  1734. if (first > 0) { // Slice is waiting; handle it first
  1735. unread();
  1736. break;
  1737. } else { // No slice; just return the family node
  1738. if (ch == 'p' || ch == 'P') {
  1739. boolean comp = (ch == 'P');
  1740. boolean oneLetter = true;
  1741. ch = next(); // Consume { if present
  1742. if (ch != '{')
  1743. unread();
  1744. else
  1745. oneLetter = false;
  1746. return family(comp, oneLetter);
  1747. }
  1748. }
  1749. break;
  1750. }
  1751. unread();
  1752. prev = cursor;
  1753. ch = escape(false, first == 0);
  1754. if (ch >= 0) {
  1755. append(ch, first);
  1756. first++;
  1757. if (isSupplementary(ch)) {
  1758. hasSupplementary = true;
  1759. }
  1760. ch = peek();
  1761. continue;
  1762. } else if (first == 0) {
  1763. return root;
  1764. }
  1765. // Unwind meta escape sequence
  1766. cursor = prev;
  1767. break;
  1768. case 0:
  1769. if (cursor >= patternLength) {
  1770. break;
  1771. }
  1772. // Fall through
  1773. default:
  1774. prev = cursor;
  1775. append(ch, first);
  1776. first++;
  1777. if (isSupplementary(ch)) {
  1778. hasSupplementary = true;
  1779. }
  1780. ch = next();
  1781. continue;
  1782. }
  1783. break;
  1784. }
  1785. if (first == 1) {
  1786. return newSingle(buffer[0]);
  1787. } else {
  1788. return newSlice(buffer, first, hasSupplementary);
  1789. }
  1790. }
  1791. private void append(int ch, int len) {
  1792. if (len >= buffer.length) {
  1793. int[] tmp = new int[len+len];
  1794. System.arraycopy(buffer, 0, tmp, 0, len);
  1795. buffer = tmp;
  1796. }
  1797. buffer[len] = ch;
  1798. }
  1799. /**
  1800. * Parses a backref greedily, taking as many numbers as it
  1801. * can. The first digit is always treated as a backref, but
  1802. * multi digit numbers are only treated as a backref if at
  1803. * least that many backrefs exist at this point in the regex.
  1804. */
  1805. private Node ref(int refNum) {
  1806. boolean done = false;
  1807. while(!done) {
  1808. int ch = peek();
  1809. switch(ch) {
  1810. case '0':
  1811. case '1':
  1812. case '2':
  1813. case '3':
  1814. case '4':
  1815. case '5':
  1816. case '6':
  1817. case '7':
  1818. case '8':
  1819. case '9':
  1820. int newRefNum = (refNum * 10) + (ch - '0');
  1821. // Add another number if it doesn't make a group
  1822. // that doesn't exist
  1823. if (capturingGroupCount - 1 < newRefNum) {
  1824. done = true;
  1825. break;
  1826. }
  1827. refNum = newRefNum;
  1828. read();
  1829. break;
  1830. default:
  1831. done = true;
  1832. break;
  1833. }
  1834. }
  1835. if (has(CASE_INSENSITIVE) || has(UNICODE_CASE))
  1836. return new CIBackRef(refNum);
  1837. else
  1838. return new BackRef(refNum);
  1839. }
  1840. /**
  1841. * Parses an escape sequence to determine the actual value that needs
  1842. * to be matched.
  1843. * If -1 is returned and create was true a new object was added to the tree
  1844. * to handle the escape sequence.
  1845. * If the returned value is greater than zero, it is the value that
  1846. * matches the escape sequence.
  1847. */
  1848. private int escape(boolean inclass, boolean create) {
  1849. int ch = skip();
  1850. switch (ch) {
  1851. case '0':
  1852. return o();
  1853. case '1':
  1854. case '2':
  1855. case '3':
  1856. case '4':
  1857. case '5':
  1858. case '6':
  1859. case '7':
  1860. case '8':
  1861. case '9':
  1862. if (inclass) break;
  1863. if (capturingGroupCount < (ch - '0'))
  1864. error("No such group yet exists at this point in the pattern");
  1865. if (create) {
  1866. root = ref((ch - '0'));
  1867. }
  1868. return -1;
  1869. case 'A':
  1870. if (inclass) break;
  1871. if (create) root = new Begin();
  1872. return -1;
  1873. case 'B':
  1874. if (inclass) break;
  1875. if (create) root = new Bound(Bound.NONE);
  1876. return -1;
  1877. case 'C':
  1878. break;
  1879. case 'D':
  1880. if (create) root = new NotCtype(ASCII.DIGIT);
  1881. return -1;
  1882. case 'E':
  1883. case 'F':
  1884. break;
  1885. case 'G':
  1886. if (inclass) break;
  1887. if (create) root = new LastMatch();
  1888. return -1;
  1889. case 'H':
  1890. case 'I':
  1891. case 'J':
  1892. case 'K':
  1893. case 'L':
  1894. case 'M':
  1895. case 'N':
  1896. case 'O':
  1897. case 'P':
  1898. break;
  1899. case 'Q':
  1900. if (create) {
  1901. // Disable metacharacters. We will return a slice
  1902. // up to the next \E
  1903. int i = cursor;
  1904. int c;
  1905. while ((c = readEscaped()) != 0) {
  1906. if (c == '\\') {
  1907. c = readEscaped();
  1908. if (c == 'E' || c == 0)
  1909. break;
  1910. else
  1911. unread();
  1912. }
  1913. }
  1914. int j = cursor-1;
  1915. if (c == 'E')
  1916. j--;
  1917. else
  1918. unread();
  1919. boolean hasSupplementary = false;
  1920. for (int x = i; x<j; x++) {
  1921. c = temp[x];
  1922. append(c, x-i);
  1923. if (isSupplementary(c)) {
  1924. hasSupplementary = true;
  1925. }
  1926. }
  1927. root = newSlice(buffer, j-i, hasSupplementary);
  1928. }
  1929. return -1;
  1930. case 'R':
  1931. break;
  1932. case 'S':
  1933. if (create) root = new NotCtype(ASCII.SPACE);
  1934. return -1;
  1935. case 'T':
  1936. case 'U':
  1937. case 'V':
  1938. break;
  1939. case 'W':
  1940. if (create) root = new NotCtype(ASCII.WORD);
  1941. return -1;
  1942. case 'X':
  1943. case 'Y':
  1944. break;
  1945. case 'Z':
  1946. if (inclass) break;
  1947. if (create) {
  1948. if (has(UNIX_LINES))
  1949. root = new UnixDollar(false);
  1950. else
  1951. root = new Dollar(false);
  1952. }
  1953. return -1;
  1954. case 'a':
  1955. return '\007';
  1956. case 'b':
  1957. if (inclass) break;
  1958. if (create) root = new Bound(Bound.BOTH);
  1959. return -1;
  1960. case 'c':
  1961. return c();
  1962. case 'd':
  1963. if (create) root = new Ctype(ASCII.DIGIT);
  1964. return -1;
  1965. case 'e':
  1966. return '\033';
  1967. case 'f':
  1968. return '\f';
  1969. case 'g':
  1970. case 'h':
  1971. case 'i':
  1972. case 'j':
  1973. case 'k':
  1974. case 'l':
  1975. case 'm':
  1976. break;
  1977. case 'n':
  1978. return '\n';
  1979. case 'o':
  1980. case 'p':
  1981. case 'q':
  1982. break;
  1983. case 'r':
  1984. return '\r';
  1985. case 's':
  1986. if (create) root = new Ctype(ASCII.SPACE);
  1987. return -1;
  1988. case 't':
  1989. return '\t';
  1990. case 'u':
  1991. return u();
  1992. case 'v':
  1993. return '\013';
  1994. case 'w':
  1995. if (create) root = new Ctype(ASCII.WORD);
  1996. return -1;
  1997. case 'x':
  1998. return x();
  1999. case 'y':
  2000. break;
  2001. case 'z':
  2002. if (inclass) break;
  2003. if (create) root = new End();
  2004. return -1;
  2005. default:
  2006. return ch;
  2007. }
  2008. error("Illegal/unsupported escape squence");
  2009. return -2;
  2010. }
  2011. /**
  2012. * Parse a character class, and return the node that matches it.
  2013. *
  2014. * Consumes a ] on the way out if consume is true. Usually consume
  2015. * is true except for the case of [abc&&def] where def is a separate
  2016. * right hand node with "understood" brackets.
  2017. */
  2018. private Node clazz(boolean consume) {
  2019. Node prev = null;
  2020. Node node = null;
  2021. BitClass bits = new BitClass(false);
  2022. boolean include = true;
  2023. boolean firstInClass = true;
  2024. int ch = next();
  2025. for (;;) {
  2026. switch (ch) {
  2027. case '^':
  2028. // Negates if first char in a class, otherwise literal
  2029. if (firstInClass) {
  2030. if (temp[cursor-1] != '[')
  2031. break;
  2032. ch = next();
  2033. include = !include;
  2034. continue;
  2035. } else {
  2036. // ^ not first in class, treat as literal
  2037. break;
  2038. }
  2039. case '[':
  2040. firstInClass = false;
  2041. node = clazz(true);
  2042. if (prev == null)
  2043. prev = node;
  2044. else
  2045. prev = new Add(prev, node);
  2046. ch = peek();
  2047. continue;
  2048. case '&':
  2049. firstInClass = false;
  2050. ch = next();
  2051. if (ch == '&') {
  2052. ch = next();
  2053. Node rightNode = null;
  2054. while (ch != ']' && ch != '&') {
  2055. if (ch == '[') {
  2056. if (rightNode == null)
  2057. rightNode = clazz(true);
  2058. else
  2059. rightNode = new Add(rightNode, clazz(true));
  2060. } else { // abc&&def
  2061. unread();
  2062. rightNode = clazz(false);
  2063. }
  2064. ch = peek();
  2065. }
  2066. if (rightNode != null)
  2067. node = rightNode;
  2068. if (prev == null) {
  2069. if (rightNode == null)
  2070. return error("Bad class syntax");
  2071. else
  2072. prev = rightNode;
  2073. } else {
  2074. prev = new Both(prev, node);
  2075. }
  2076. } else {
  2077. // treat as a literal &
  2078. unread();
  2079. break;
  2080. }
  2081. continue;
  2082. case 0:
  2083. firstInClass = false;
  2084. if (cursor >= patternLength)
  2085. return error("Unclosed character class");
  2086. break;
  2087. case ']':
  2088. firstInClass = false;
  2089. if (prev != null) {
  2090. if (consume)
  2091. next();
  2092. return prev;
  2093. }
  2094. break;
  2095. default:
  2096. firstInClass = false;
  2097. break;
  2098. }
  2099. node = range(bits);
  2100. if (include) {
  2101. if (prev == null) {
  2102. prev = node;
  2103. } else {
  2104. if (prev != node)
  2105. prev = new Add(prev, node);
  2106. }
  2107. } else {
  2108. if (prev == null) {
  2109. prev = node.dup(true); // Complement
  2110. } else {
  2111. if (prev != node)
  2112. prev = new Sub(prev, node);
  2113. }
  2114. }
  2115. ch = peek();
  2116. }
  2117. }
  2118. /**
  2119. * Parse a single character or a character range in a character class
  2120. * and return its representative node.
  2121. */
  2122. private Node range(BitClass bits) {
  2123. int ch = peek();
  2124. if (ch == '\\') {
  2125. ch = nextEscaped();
  2126. if (ch == 'p' || ch == 'P') { // A property
  2127. boolean comp = (ch == 'P');
  2128. boolean oneLetter = true;
  2129. // Consume { if present
  2130. ch = next();
  2131. if (ch != '{')
  2132. unread();
  2133. else
  2134. oneLetter = false;
  2135. return family(comp, oneLetter);
  2136. } else { // ordinary escape
  2137. unread();
  2138. ch = escape(true, true);
  2139. if (ch == -1)
  2140. return root;
  2141. }
  2142. } else {
  2143. ch = single();
  2144. }
  2145. if (ch >= 0) {
  2146. if (peek() == '-') {
  2147. int endRange = temp[cursor+1];
  2148. if (endRange == '[') {
  2149. if (ch < 256)
  2150. return bits.add(ch, flags());
  2151. return newSingle(ch);
  2152. }
  2153. if (endRange != ']') {
  2154. next();
  2155. int m = single();
  2156. if (m < ch)
  2157. return error("Illegal character range");
  2158. if (has(CASE_INSENSITIVE) || has(UNICODE_CASE))
  2159. return new CIRange(ch, m);
  2160. else
  2161. return new Range(ch, m);
  2162. }
  2163. }
  2164. if (ch < 256)
  2165. return bits.add(ch, flags());
  2166. return newSingle(ch);
  2167. }
  2168. return error("Unexpected character '"+((char)ch)+"'");
  2169. }
  2170. private int single() {
  2171. int ch = peek();
  2172. switch (ch) {
  2173. case '\\':
  2174. return escape(true, false);
  2175. default:
  2176. next();
  2177. return ch;
  2178. }
  2179. }
  2180. /**
  2181. * Parses a Unicode character family and returns its representative node.
  2182. * Reference to an unknown character family results in a list of supported
  2183. * families in the error.
  2184. */
  2185. private Node family(boolean not, boolean singleLetter) {
  2186. next();
  2187. String name;
  2188. if (singleLetter) {
  2189. int c = temp[cursor];
  2190. if (!Character.isSupplementaryCodePoint(c)) {
  2191. name = String.valueOf((char)c);
  2192. } else {
  2193. name = new String(temp, cursor, 1);
  2194. }
  2195. name = name.intern();
  2196. read();
  2197. } else {
  2198. int i = cursor;
  2199. mark('}');
  2200. while(read() != '}') {
  2201. }
  2202. mark('\000');
  2203. int j = cursor;
  2204. if (j > patternLength)
  2205. return error("Unclosed character family");
  2206. if (i + 1 >= j)
  2207. return error("Empty character family");
  2208. name = new String(temp, i, j-i-1).intern();
  2209. }
  2210. if (name.startsWith("In")) {
  2211. name = name.substring(2, name.length()).intern();
  2212. return retrieveFamilyNode(name, not);
  2213. }
  2214. if (name.startsWith("Is"))
  2215. name = name.substring(2, name.length()).intern();
  2216. return retrieveCategoryNode(name).dup(not);
  2217. }
  2218. private Node retrieveFamilyNode(String name, boolean not) {
  2219. if (name == null) {
  2220. return familyError("", "Null character family.");
  2221. }
  2222. Node n = null;
  2223. try {
  2224. Character.UnicodeBlock block = Character.UnicodeBlock.forName(name);
  2225. n = new UBlock(block, not);
  2226. } catch (IllegalArgumentException iae) {
  2227. return familyError(name, "Unknown character family {");
  2228. }
  2229. return n;
  2230. }
  2231. private Node retrieveCategoryNode(String name) {
  2232. Node n = (Node)categoryNames.cMap.get(name);
  2233. if (n != null)
  2234. return n;
  2235. return familyError(name, "Unknown character category {");
  2236. }
  2237. private Node familyError(String name, String type) {
  2238. StringBuilder sb = new StringBuilder();
  2239. sb.append(type);
  2240. sb.append(name);
  2241. sb.append("}");
  2242. name = sb.toString();
  2243. return error(name);
  2244. }
  2245. /**
  2246. * Parses a group and returns the head node of a set of nodes that process
  2247. * the group. Sometimes a double return system is used where the tail is
  2248. * returned in root.
  2249. */
  2250. private Node group0() {
  2251. boolean capturingGroup = false;
  2252. Node head = null;
  2253. Node tail = null;
  2254. int save = flags;
  2255. root = null;
  2256. int ch = next();
  2257. if (ch == '?') {
  2258. ch = skip();
  2259. switch (ch) {
  2260. case ':': // (?:xxx) pure group
  2261. head = createGroup(true);
  2262. tail = root;
  2263. head.next = expr(tail);
  2264. break;
  2265. case '=': // (?=xxx) and (?!xxx) lookahead
  2266. case '!':
  2267. head = createGroup(true);
  2268. tail = root;
  2269. head.next = expr(tail);
  2270. if (ch == '=') {
  2271. head = tail = new Pos(head);
  2272. } else {
  2273. head = tail = new Neg(head);
  2274. }
  2275. break;
  2276. case '>': // (?>xxx) independent group
  2277. head = createGroup(true);
  2278. tail = root;
  2279. head.next = expr(tail);
  2280. head = tail = new Ques(head, INDEPENDENT);
  2281. break;
  2282. case '<': // (?<xxx) look behind
  2283. ch = read();
  2284. int start = cursor;
  2285. head = createGroup(true);
  2286. tail = root;
  2287. head.next = expr(tail);
  2288. TreeInfo info = new TreeInfo();
  2289. head.study(info);
  2290. if (info.maxValid == false) {
  2291. return error("Look-behind group does not have "
  2292. + "an obvious maximum length");
  2293. }
  2294. boolean hasSupplementary = findSupplementary(start, patternLength);
  2295. if (ch == '=') {
  2296. head = tail = (hasSupplementary ?
  2297. new BehindS(head, info.maxLength,
  2298. info.minLength) :
  2299. new Behind(head, info.maxLength,
  2300. info.minLength));
  2301. } else if (ch == '!') {
  2302. head = tail = (hasSupplementary ?
  2303. new NotBehindS(head, info.maxLength,
  2304. info.minLength) :
  2305. new NotBehind(head, info.maxLength,
  2306. info.minLength));
  2307. } else {
  2308. error("Unknown look-behind group");
  2309. }
  2310. break;
  2311. case '$':
  2312. case '@':
  2313. return error("Unknown group type");
  2314. default: // (?xxx:) inlined match flags
  2315. unread();
  2316. addFlag();
  2317. ch = read();
  2318. if (ch == ')') {
  2319. return null; // Inline modifier only
  2320. }
  2321. if (ch != ':') {
  2322. return error("Unknown inline modifier");
  2323. }
  2324. head = createGroup(true);
  2325. tail = root;
  2326. head.next = expr(tail);
  2327. break;
  2328. }
  2329. } else { // (xxx) a regular group
  2330. capturingGroup = true;
  2331. head = createGroup(false);
  2332. tail = root;
  2333. head.next = expr(tail);
  2334. }
  2335. accept(')', "Unclosed group");
  2336. flags = save;
  2337. // Check for quantifiers
  2338. Node node = closure(head);
  2339. if (node == head) { // No closure
  2340. root = tail;
  2341. return node; // Dual return
  2342. }
  2343. if (head == tail) { // Zero length assertion
  2344. root = node;
  2345. return node; // Dual return
  2346. }
  2347. if (node instanceof Ques) {
  2348. Ques ques = (Ques) node;
  2349. if (ques.type == POSSESSIVE) {
  2350. root = node;
  2351. return node;
  2352. }
  2353. // Dummy node to connect branch
  2354. tail.next = new Dummy();
  2355. tail = tail.next;
  2356. if (ques.type == GREEDY) {
  2357. head = new Branch(head, tail);
  2358. } else { // Reluctant quantifier
  2359. head = new Branch(tail, head);
  2360. }
  2361. root = tail;
  2362. return head;
  2363. } else if (node instanceof Curly) {
  2364. Curly curly = (Curly) node;
  2365. if (curly.type == POSSESSIVE) {
  2366. root = node;
  2367. return node;
  2368. }
  2369. // Discover if the group is deterministic
  2370. TreeInfo info = new TreeInfo();
  2371. if (head.study(info)) { // Deterministic
  2372. GroupTail temp = (GroupTail) tail;
  2373. head = root = new GroupCurly(head.next, curly.cmin,
  2374. curly.cmax, curly.type,
  2375. ((GroupTail)tail).localIndex,
  2376. ((GroupTail)tail).groupIndex,
  2377. capturingGroup);
  2378. return head;
  2379. } else { // Non-deterministic
  2380. int temp = ((GroupHead) head).localIndex;
  2381. Loop loop;
  2382. if (curly.type == GREEDY)
  2383. loop = new Loop(this.localCount, temp);
  2384. else // Reluctant Curly
  2385. loop = new LazyLoop(this.localCount, temp);
  2386. Prolog prolog = new Prolog(loop);
  2387. this.localCount += 1;
  2388. loop.cmin = curly.cmin;
  2389. loop.cmax = curly.cmax;
  2390. loop.body = head;
  2391. tail.next = loop;
  2392. root = loop;
  2393. return prolog; // Dual return
  2394. }
  2395. } else if (node instanceof First) {
  2396. root = node;
  2397. return node;
  2398. }
  2399. return error("Internal logic error");
  2400. }
  2401. /**
  2402. * Create group head and tail nodes using double return. If the group is
  2403. * created with anonymous true then it is a pure group and should not
  2404. * affect group counting.
  2405. */
  2406. private Node createGroup(boolean anonymous) {
  2407. int localIndex = localCount++;
  2408. int groupIndex = 0;
  2409. if (!anonymous)
  2410. groupIndex = capturingGroupCount++;
  2411. GroupHead head = new GroupHead(localIndex);
  2412. root = new GroupTail(localIndex, groupIndex);
  2413. if (!anonymous && groupIndex < 10)
  2414. groupNodes[groupIndex] = head;
  2415. return head;
  2416. }
  2417. /**
  2418. * Parses inlined match flags and set them appropriately.
  2419. */
  2420. private void addFlag() {
  2421. int ch = peek();
  2422. for (;;) {
  2423. switch (ch) {
  2424. case 'i':
  2425. flags |= CASE_INSENSITIVE;
  2426. break;
  2427. case 'm':
  2428. flags |= MULTILINE;
  2429. break;
  2430. case 's':
  2431. flags |= DOTALL;
  2432. break;
  2433. case 'd':
  2434. flags |= UNIX_LINES;
  2435. break;
  2436. case 'u':
  2437. flags |= UNICODE_CASE;
  2438. break;
  2439. case 'c':
  2440. flags |= CANON_EQ;
  2441. break;
  2442. case 'x':
  2443. flags |= COMMENTS;
  2444. break;
  2445. case '-': // subFlag then fall through
  2446. ch = next();
  2447. subFlag();
  2448. default:
  2449. return;
  2450. }
  2451. ch = next();
  2452. }
  2453. }
  2454. /**
  2455. * Parses the second part of inlined match flags and turns off
  2456. * flags appropriately.
  2457. */
  2458. private void subFlag() {
  2459. int ch = peek();
  2460. for (;;) {
  2461. switch (ch) {
  2462. case 'i':
  2463. flags &= ~CASE_INSENSITIVE;
  2464. break;
  2465. case 'm':
  2466. flags &= ~MULTILINE;
  2467. break;
  2468. case 's':
  2469. flags &= ~DOTALL;
  2470. break;
  2471. case 'd':
  2472. flags &= ~UNIX_LINES;
  2473. break;
  2474. case 'u':
  2475. flags &= ~UNICODE_CASE;
  2476. break;
  2477. case 'c':
  2478. flags &= ~CANON_EQ;
  2479. break;
  2480. case 'x':
  2481. flags &= ~COMMENTS;
  2482. break;
  2483. default:
  2484. return;
  2485. }
  2486. ch = next();
  2487. }
  2488. }
  2489. static final int MAX_REPS = 0x7FFFFFFF;
  2490. static final int GREEDY = 0;
  2491. static final int LAZY = 1;
  2492. static final int POSSESSIVE = 2;
  2493. static final int INDEPENDENT = 3;
  2494. /**
  2495. * Processes repetition. If the next character peeked is a quantifier
  2496. * then new nodes must be appended to handle the repetition.
  2497. * Prev could be a single or a group, so it could be a chain of nodes.
  2498. */
  2499. private Node closure(Node prev) {
  2500. Node atom;
  2501. int ch = peek();
  2502. switch (ch) {
  2503. case '?':
  2504. ch = next();
  2505. if (ch == '?') {
  2506. next();
  2507. return new Ques(prev, LAZY);
  2508. } else if (ch == '+') {
  2509. next();
  2510. return new Ques(prev, POSSESSIVE);
  2511. }
  2512. return new Ques(prev, GREEDY);
  2513. case '*':
  2514. ch = next();
  2515. if (ch == '?') {
  2516. next();
  2517. return new Curly(prev, 0, MAX_REPS, LAZY);
  2518. } else if (ch == '+') {
  2519. next();
  2520. return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
  2521. }
  2522. return new Curly(prev, 0, MAX_REPS, GREEDY);
  2523. case '+':
  2524. ch = next();
  2525. if (ch == '?') {
  2526. next();
  2527. return new Curly(prev, 1, MAX_REPS, LAZY);
  2528. } else if (ch == '+') {
  2529. next();
  2530. return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
  2531. }
  2532. return new Curly(prev, 1, MAX_REPS, GREEDY);
  2533. case '{':
  2534. ch = temp[cursor+1];
  2535. if (ASCII.isDigit(ch)) {
  2536. skip();
  2537. int cmin = 0;
  2538. do {
  2539. cmin = cmin * 10 + (ch - '0');
  2540. } while (ASCII.isDigit(ch = read()));
  2541. int cmax = cmin;
  2542. if (ch == ',') {
  2543. ch = read();
  2544. cmax = MAX_REPS;
  2545. if (ch != '}') {
  2546. cmax = 0;
  2547. while (ASCII.isDigit(ch)) {
  2548. cmax = cmax * 10 + (ch - '0');
  2549. ch = read();
  2550. }
  2551. }
  2552. }
  2553. if (ch != '}')
  2554. return error("Unclosed counted closure");
  2555. if (((cmin) | (cmax) | (cmax - cmin)) < 0)
  2556. return error("Illegal repetition range");
  2557. Curly curly;
  2558. ch = peek();
  2559. if (ch == '?') {
  2560. next();
  2561. curly = new Curly(prev, cmin, cmax, LAZY);
  2562. } else if (ch == '+') {
  2563. next();
  2564. curly = new Curly(prev, cmin, cmax, POSSESSIVE);
  2565. } else {
  2566. curly = new Curly(prev, cmin, cmax, GREEDY);
  2567. }
  2568. return curly;
  2569. } else {
  2570. error("Illegal repetition");
  2571. }
  2572. return prev;
  2573. default:
  2574. return prev;
  2575. }
  2576. }
  2577. /**
  2578. * Utility method for parsing control escape sequences.
  2579. */
  2580. private int c() {
  2581. if (cursor < patternLength) {
  2582. return read() ^ 64;
  2583. }
  2584. error("Illegal control escape sequence");
  2585. return -1;
  2586. }
  2587. /**
  2588. * Utility method for parsing octal escape sequences.
  2589. */
  2590. private int o() {
  2591. int n = read();
  2592. if (((n-'0')|('7'-n)) >= 0) {
  2593. int m = read();
  2594. if (((m-'0')|('7'-m)) >= 0) {
  2595. int o = read();
  2596. if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
  2597. return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
  2598. }
  2599. unread();
  2600. return (n - '0') * 8 + (m - '0');
  2601. }
  2602. unread();
  2603. return (n - '0');
  2604. }
  2605. error("Illegal octal escape sequence");
  2606. return -1;
  2607. }
  2608. /**
  2609. * Utility method for parsing hexadecimal escape sequences.
  2610. */
  2611. private int x() {
  2612. int n = read();
  2613. if (ASCII.isHexDigit(n)) {
  2614. int m = read();
  2615. if (ASCII.isHexDigit(m)) {
  2616. return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
  2617. }
  2618. }
  2619. error("Illegal hexadecimal escape sequence");
  2620. return -1;
  2621. }
  2622. /**
  2623. * Utility method for parsing unicode escape sequences.
  2624. */
  2625. private int u() {
  2626. int n = 0;
  2627. for (int i = 0; i < 4; i++) {
  2628. int ch = read();
  2629. if (!ASCII.isHexDigit(ch)) {
  2630. error("Illegal Unicode escape sequence");
  2631. }
  2632. n = n * 16 + ASCII.toDigit(ch);
  2633. }
  2634. return n;
  2635. }
  2636. //
  2637. // Utility methods for code point support
  2638. //
  2639. /**
  2640. * Tests a surrogate value.
  2641. */
  2642. private static final boolean isSurrogate(int c) {
  2643. return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
  2644. }
  2645. private static final int countChars(CharSequence seq, int index,
  2646. int lengthInCodePoints) {
  2647. // optimization
  2648. if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
  2649. assert (index >= 0 && index < seq.length());
  2650. return 1;
  2651. }
  2652. int length = seq.length();
  2653. int x = index;
  2654. if (lengthInCodePoints >= 0) {
  2655. assert (index >= 0 && index < length);
  2656. for (int i = 0; x < length && i < lengthInCodePoints; i++) {
  2657. if (Character.isHighSurrogate(seq.charAt(x++))) {
  2658. if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
  2659. x++;
  2660. }
  2661. }
  2662. }
  2663. return x - index;
  2664. }
  2665. assert (index >= 0 && index <= length);
  2666. if (index == 0) {
  2667. return 0;
  2668. }
  2669. int len = -lengthInCodePoints;
  2670. for (int i = 0; x > 0 && i < len; i++) {
  2671. if (Character.isLowSurrogate(seq.charAt(--x))) {
  2672. if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
  2673. x--;
  2674. }
  2675. }
  2676. }
  2677. return index - x;
  2678. }
  2679. private static final int countCodePoints(CharSequence seq) {
  2680. int length = seq.length();
  2681. int n = 0;
  2682. for (int i = 0; i < length; ) {
  2683. n++;
  2684. if (Character.isHighSurrogate(seq.charAt(i++))) {
  2685. if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
  2686. i++;
  2687. }
  2688. }
  2689. }
  2690. return n;
  2691. }
  2692. /**
  2693. * Creates a bit vector for matching Latin-1 values. A normal BitClass
  2694. * never matches values above Latin-1, and a complemented BitClass always
  2695. * matches values above Latin-1.
  2696. */
  2697. static final class BitClass extends Node {
  2698. boolean[] bits = new boolean[256];
  2699. boolean complementMe = false;
  2700. BitClass(boolean not) {
  2701. complementMe = not;
  2702. }
  2703. BitClass(boolean[] newBits, boolean not) {
  2704. complementMe = not;
  2705. bits = newBits;
  2706. }
  2707. Node add(int c, int f) {
  2708. if ((f & CASE_INSENSITIVE) == 0) {
  2709. bits[c] = true;
  2710. } else if (ASCII.isAscii(c)) {
  2711. bits[ASCII.toUpper(c)] = true;
  2712. bits[ASCII.toLower(c)] = true;
  2713. } else {
  2714. bits[Character.toLowerCase((char)c)] = true;
  2715. bits[Character.toUpperCase((char)c)] = true;
  2716. }
  2717. return this;
  2718. }
  2719. Node dup(boolean not) {
  2720. return new BitClass(bits, not);
  2721. }
  2722. boolean match(Matcher matcher, int i, CharSequence seq) {
  2723. if (i >= matcher.to) {
  2724. matcher.hitEnd = true;
  2725. return false;
  2726. }
  2727. int c = Character.codePointAt(seq, i);
  2728. boolean charMatches =
  2729. (c > 255) ? complementMe : (bits[c] ^ complementMe);
  2730. return charMatches && next.match(matcher, i+Character.charCount(c), seq);
  2731. }
  2732. boolean study(TreeInfo info) {
  2733. info.minLength++;
  2734. info.maxLength++;
  2735. return next.study(info);
  2736. }
  2737. }
  2738. /**
  2739. * Utility method for creating a single character matcher.
  2740. */
  2741. private Node newSingle(int ch) {
  2742. int f = flags;
  2743. if ((f & (CASE_INSENSITIVE|UNICODE_CASE)) == 0) {
  2744. return new Single(ch);
  2745. }
  2746. if ((f & UNICODE_CASE) == 0) {
  2747. return new SingleA(ch);
  2748. }
  2749. return new SingleU(ch);
  2750. }
  2751. /**
  2752. * Utility method for creating a string slice matcher.
  2753. */
  2754. private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
  2755. int[] tmp = new int[count];
  2756. int i = flags;
  2757. if ((i & (CASE_INSENSITIVE|UNICODE_CASE)) == 0) {
  2758. for (i = 0; i < count; i++) {
  2759. tmp[i] = buf[i];
  2760. }
  2761. return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
  2762. } else if ((i & UNICODE_CASE) == 0) {
  2763. for (i = 0; i < count; i++) {
  2764. tmp[i] = (char)ASCII.toLower(buf[i]);
  2765. }
  2766. return new SliceA(tmp);
  2767. } else {
  2768. for (i = 0; i < count; i++) {
  2769. int c = buf[i];
  2770. c = Character.toUpperCase(c);
  2771. c = Character.toLowerCase(c);
  2772. tmp[i] = c;
  2773. }
  2774. return new SliceU(tmp);
  2775. }
  2776. }
  2777. /**
  2778. * The following classes are the building components of the object
  2779. * tree that represents a compiled regular expression. The object tree
  2780. * is made of individual elements that handle constructs in the Pattern.
  2781. * Each type of object knows how to match its equivalent construct with
  2782. * the match() method.
  2783. */
  2784. /**
  2785. * Base class for all node classes. Subclasses should override the match()
  2786. * method as appropriate. This class is an accepting node, so its match()
  2787. * always returns true.
  2788. */
  2789. static class Node extends Object {
  2790. Node next;
  2791. Node() {
  2792. next = Pattern.accept;
  2793. }
  2794. Node dup(boolean not) {
  2795. if (not) {
  2796. return new Not(this);
  2797. } else {
  2798. throw new RuntimeException("internal error in Node dup()");
  2799. }
  2800. }
  2801. /**
  2802. * This method implements the classic accept node.
  2803. */
  2804. boolean match(Matcher matcher, int i, CharSequence seq) {
  2805. matcher.last = i;
  2806. matcher.groups[0] = matcher.first;
  2807. matcher.groups[1] = matcher.last;
  2808. return true;
  2809. }
  2810. /**
  2811. * This method is good for all zero length assertions.
  2812. */
  2813. boolean study(TreeInfo info) {
  2814. if (next != null) {
  2815. return next.study(info);
  2816. } else {
  2817. return info.deterministic;
  2818. }
  2819. }
  2820. }
  2821. static class LastNode extends Node {
  2822. /**
  2823. * This method implements the classic accept node with
  2824. * the addition of a check to see if the match occurred
  2825. * using all of the input.
  2826. */
  2827. boolean match(Matcher matcher, int i, CharSequence seq) {
  2828. if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
  2829. return false;
  2830. matcher.last = i;
  2831. matcher.groups[0] = matcher.first;
  2832. matcher.groups[1] = matcher.last;
  2833. return true;
  2834. }
  2835. }
  2836. /**
  2837. * Dummy node to assist in connecting branches.
  2838. */
  2839. static class Dummy extends Node {
  2840. boolean match(Matcher matcher, int i, CharSequence seq) {
  2841. return next.match(matcher, i, seq);
  2842. }
  2843. }
  2844. /**
  2845. * Used for REs that can start anywhere within the input string.
  2846. * This basically tries to match repeatedly at each spot in the
  2847. * input string, moving forward after each try. An anchored search
  2848. * or a BnM will bypass this node completely.
  2849. */
  2850. static class Start extends Node {
  2851. int minLength;
  2852. Start(Node node) {
  2853. this.next = node;
  2854. TreeInfo info = new TreeInfo();
  2855. next.study(info);
  2856. minLength = info.minLength;
  2857. }
  2858. boolean match(Matcher matcher, int i, CharSequence seq) {
  2859. if (i > matcher.to - minLength) {
  2860. matcher.hitEnd = true;
  2861. return false;
  2862. }
  2863. boolean ret = false;
  2864. int guard = matcher.to - minLength;
  2865. for (; i <= guard; i++) {
  2866. if (ret = next.match(matcher, i, seq))
  2867. break;
  2868. if (i == guard)
  2869. matcher.hitEnd = true;
  2870. }
  2871. if (ret) {
  2872. matcher.first = i;
  2873. matcher.groups[0] = matcher.first;
  2874. matcher.groups[1] = matcher.last;
  2875. }
  2876. return ret;
  2877. }
  2878. boolean study(TreeInfo info) {
  2879. next.study(info);
  2880. info.maxValid = false;
  2881. info.deterministic = false;
  2882. return false;
  2883. }
  2884. }
  2885. /*
  2886. * StartS supports supplementary characters, including unpaired surrogates.
  2887. */
  2888. static final class StartS extends Start {
  2889. StartS(Node node) {
  2890. super(node);
  2891. }
  2892. boolean match(Matcher matcher, int i, CharSequence seq) {
  2893. if (i > matcher.to - minLength) {
  2894. matcher.hitEnd = true;
  2895. return false;
  2896. }
  2897. boolean ret = false;
  2898. int guard = matcher.to - minLength;
  2899. while (i <= guard) {
  2900. if ((ret = next.match(matcher, i, seq)) || i == guard)
  2901. break;
  2902. // Optimization to move to the next character. This is
  2903. // faster than countChars(seq, i, 1).
  2904. if (Character.isHighSurrogate(seq.charAt(i++))) {
  2905. if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
  2906. i++;
  2907. }
  2908. }
  2909. if (i == guard)
  2910. matcher.hitEnd = true;
  2911. }
  2912. if (ret) {
  2913. matcher.first = i;
  2914. matcher.groups[0] = matcher.first;
  2915. matcher.groups[1] = matcher.last;
  2916. }
  2917. return ret;
  2918. }
  2919. }
  2920. /**
  2921. * Node to anchor at the beginning of input. This object implements the
  2922. * match for a \A sequence, and the caret anchor will use this if not in
  2923. * multiline mode.
  2924. */
  2925. static final class Begin extends Node {
  2926. boolean match(Matcher matcher, int i, CharSequence seq) {
  2927. int fromIndex = (matcher.anchoringBounds) ?
  2928. matcher.from : 0;
  2929. if (i == fromIndex && next.match(matcher, i, seq)) {
  2930. matcher.first = i;
  2931. matcher.groups[0] = i;
  2932. matcher.groups[1] = matcher.last;
  2933. return true;
  2934. } else {
  2935. return false;
  2936. }
  2937. }
  2938. }
  2939. /**
  2940. * Node to anchor at the end of input. This is the absolute end, so this
  2941. * should not match at the last newline before the end as $ will.
  2942. */
  2943. static final class End extends Node {
  2944. boolean match(Matcher matcher, int i, CharSequence seq) {
  2945. int endIndex = (matcher.anchoringBounds) ?
  2946. matcher.to : matcher.getTextLength();
  2947. if (i == endIndex) {
  2948. matcher.hitEnd = true;
  2949. return next.match(matcher, i, seq);
  2950. }
  2951. return false;
  2952. }
  2953. }
  2954. /**
  2955. * Node to anchor at the beginning of a line. This is essentially the
  2956. * object to match for the multiline ^.
  2957. */
  2958. static final class Caret extends Node {
  2959. boolean match(Matcher matcher, int i, CharSequence seq) {
  2960. int startIndex = matcher.from;
  2961. int endIndex = matcher.to;
  2962. if (!matcher.anchoringBounds) {
  2963. startIndex = 0;
  2964. endIndex = matcher.getTextLength();
  2965. }
  2966. // Perl does not match ^ at end of input even after newline
  2967. if (i == endIndex) {
  2968. matcher.hitEnd = true;
  2969. return false;
  2970. }
  2971. if (i > startIndex) {
  2972. char ch = seq.charAt(i-1);
  2973. if (ch != '\n' && ch != '\r'
  2974. && (ch|1) != '\u2029'
  2975. && ch != '\u0085' ) {
  2976. return false;
  2977. }
  2978. // Should treat /r/n as one newline
  2979. if (ch == '\r' && seq.charAt(i) == '\n')
  2980. return false;
  2981. }
  2982. return next.match(matcher, i, seq);
  2983. }
  2984. }
  2985. /**
  2986. * Node to anchor at the beginning of a line when in unixdot mode.
  2987. */
  2988. static final class UnixCaret extends Node {
  2989. boolean match(Matcher matcher, int i, CharSequence seq) {
  2990. int startIndex = matcher.from;
  2991. int endIndex = matcher.to;
  2992. if (!matcher.anchoringBounds) {
  2993. startIndex = 0;
  2994. endIndex = matcher.getTextLength();
  2995. }
  2996. // Perl does not match ^ at end of input even after newline
  2997. if (i == endIndex) {
  2998. matcher.hitEnd = true;
  2999. return false;
  3000. }
  3001. if (i > startIndex) {
  3002. char ch = seq.charAt(i-1);
  3003. if (ch != '\n') {
  3004. return false;
  3005. }
  3006. }
  3007. return next.match(matcher, i, seq);
  3008. }
  3009. }
  3010. /**
  3011. * Node to match the location where the last match ended.
  3012. * This is used for the \G construct.
  3013. */
  3014. static final class LastMatch extends Node {
  3015. boolean match(Matcher matcher, int i, CharSequence seq) {
  3016. if (i != matcher.oldLast)
  3017. return false;
  3018. return next.match(matcher, i, seq);
  3019. }
  3020. }
  3021. /**
  3022. * Node to anchor at the end of a line or the end of input based on the
  3023. * multiline mode.
  3024. *
  3025. * When not in multiline mode, the $ can only match at the very end
  3026. * of the input, unless the input ends in a line terminator in which
  3027. * it matches right before the last line terminator.
  3028. *
  3029. * Note that \r\n is considered an atomic line terminator.
  3030. *
  3031. * Like ^ the $ operator matches at a position, it does not match the
  3032. * line terminators themselves.
  3033. */
  3034. static final class Dollar extends Node {
  3035. boolean multiline;
  3036. Dollar(boolean mul) {
  3037. multiline = mul;
  3038. }
  3039. boolean match(Matcher matcher, int i, CharSequence seq) {
  3040. int endIndex = (matcher.anchoringBounds) ?
  3041. matcher.to : matcher.getTextLength();
  3042. if (!multiline) {
  3043. if (i < endIndex - 2)
  3044. return false;
  3045. if (i == endIndex - 2) {
  3046. char ch = seq.charAt(i);
  3047. if (ch != '\r')
  3048. return false;
  3049. ch = seq.charAt(i + 1);
  3050. if (ch != '\n')
  3051. return false;
  3052. }
  3053. }
  3054. // Matches before any line terminator; also matches at the
  3055. // end of input
  3056. // Before line terminator:
  3057. // If multiline, we match here no matter what
  3058. // If not multiline, fall through so that the end
  3059. // is marked as hit; this must be a /r/n or a /n
  3060. // at the very end so the end was hit; more input
  3061. // could make this not match here
  3062. if (i < endIndex) {
  3063. char ch = seq.charAt(i);
  3064. if (ch == '\n') {
  3065. // No match between \r\n
  3066. if (i > 0 && seq.charAt(i-1) == '\r')
  3067. return false;
  3068. if (multiline)
  3069. return next.match(matcher, i, seq);
  3070. } else if (ch == '\r' || ch == '\u0085' ||
  3071. (ch|1) == '\u2029') {
  3072. if (multiline)
  3073. return next.match(matcher, i, seq);
  3074. } else { // No line terminator, no match
  3075. return false;
  3076. }
  3077. }
  3078. // Matched at current end so hit end
  3079. matcher.hitEnd = true;
  3080. // If a $ matches because of end of input, then more input
  3081. // could cause it to fail!
  3082. matcher.requireEnd = true;
  3083. return next.match(matcher, i, seq);
  3084. }
  3085. boolean study(TreeInfo info) {
  3086. next.study(info);
  3087. return info.deterministic;
  3088. }
  3089. }
  3090. /**
  3091. * Node to anchor at the end of a line or the end of input based on the
  3092. * multiline mode when in unix lines mode.
  3093. */
  3094. static final class UnixDollar extends Node {
  3095. boolean multiline;
  3096. UnixDollar(boolean mul) {
  3097. multiline = mul;
  3098. }
  3099. boolean match(Matcher matcher, int i, CharSequence seq) {
  3100. int endIndex = (matcher.anchoringBounds) ?
  3101. matcher.to : matcher.getTextLength();
  3102. if (i < endIndex) {
  3103. char ch = seq.charAt(i);
  3104. if (ch == '\n') {
  3105. // If not multiline, then only possible to
  3106. // match at very end or one before end
  3107. if (multiline == false && i != endIndex - 1)
  3108. return false;
  3109. // If multiline return next.match without setting
  3110. // matcher.hitEnd
  3111. if (multiline)
  3112. return next.match(matcher, i, seq);
  3113. } else {
  3114. return false;
  3115. }
  3116. }
  3117. // Matching because at the end or 1 before the end;
  3118. // more input could change this so set hitEnd
  3119. matcher.hitEnd = true;
  3120. // If a $ matches because of end of input, then more input
  3121. // could cause it to fail!
  3122. matcher.requireEnd = true;
  3123. return next.match(matcher, i, seq);
  3124. }
  3125. boolean study(TreeInfo info) {
  3126. next.study(info);
  3127. return info.deterministic;
  3128. }
  3129. }
  3130. /**
  3131. * Node class for a single character value.
  3132. */
  3133. static final class Single extends Node {
  3134. int ch;
  3135. int len;
  3136. Single(int n) {
  3137. ch = n;
  3138. len = Character.charCount(ch);
  3139. }
  3140. Node dup(boolean not) {
  3141. if (not)
  3142. return new NotSingle(ch);
  3143. else
  3144. return new Single(ch);
  3145. }
  3146. boolean match(Matcher matcher, int i, CharSequence seq) {
  3147. if (i >= matcher.to) {
  3148. matcher.hitEnd = true;
  3149. return false;
  3150. }
  3151. int c = Character.codePointAt(seq, i);
  3152. return (c == ch
  3153. && next.match(matcher, i+len, seq));
  3154. }
  3155. boolean study(TreeInfo info) {
  3156. info.minLength++;
  3157. info.maxLength++;
  3158. return next.study(info);
  3159. }
  3160. }
  3161. /**
  3162. * Node class to match any character except a single char value.
  3163. */
  3164. static final class NotSingle extends Node {
  3165. int ch;
  3166. NotSingle(int n) {
  3167. ch = n;
  3168. }
  3169. Node dup(boolean not) {
  3170. if (not)
  3171. return new Single(ch);
  3172. else
  3173. return new NotSingle(ch);
  3174. }
  3175. boolean match(Matcher matcher, int i, CharSequence seq) {
  3176. if (i >= matcher.to) {
  3177. matcher.hitEnd = true;
  3178. return false;
  3179. }
  3180. int c = Character.codePointAt(seq, i);
  3181. return (c != ch
  3182. && next.match(matcher, i+Character.charCount(c), seq));
  3183. }
  3184. boolean study(TreeInfo info) {
  3185. info.minLength++;
  3186. info.maxLength++;
  3187. return next.study(info);
  3188. }
  3189. }
  3190. /**
  3191. * Case independent ASCII value.
  3192. */
  3193. static final class SingleA extends Node {
  3194. int ch;
  3195. SingleA(int n) {
  3196. ch = ASCII.toLower(n);
  3197. }
  3198. Node dup(boolean not) {
  3199. if (not)
  3200. return new NotSingleA(ch);
  3201. else
  3202. return new SingleA(ch);
  3203. }
  3204. boolean match(Matcher matcher, int i, CharSequence seq) {
  3205. if (i < matcher.to) {
  3206. int c = seq.charAt(i);
  3207. if (c == ch || ASCII.toLower(c) == ch) {
  3208. return next.match(matcher, i+1, seq);
  3209. }
  3210. }
  3211. matcher.hitEnd = true;
  3212. return false;
  3213. }
  3214. boolean study(TreeInfo info) {
  3215. info.minLength++;
  3216. info.maxLength++;
  3217. return next.study(info);
  3218. }
  3219. }
  3220. static final class NotSingleA extends Node {
  3221. int ch;
  3222. NotSingleA(int n) {
  3223. ch = ASCII.toLower(n);
  3224. }
  3225. Node dup(boolean not) {
  3226. if (not)
  3227. return new SingleA(ch);
  3228. else
  3229. return new NotSingleA(ch);
  3230. }
  3231. boolean match(Matcher matcher, int i, CharSequence seq) {
  3232. if (i < matcher.to) {
  3233. int c = Character.codePointAt(seq, i);
  3234. if (c != ch && ASCII.toLower(c) != ch) {
  3235. return next.match(matcher, i+Character.charCount(c), seq);
  3236. }
  3237. }
  3238. matcher.hitEnd = true;
  3239. return false;
  3240. }
  3241. boolean study(TreeInfo info) {
  3242. info.minLength++;
  3243. info.maxLength++;
  3244. return next.study(info);
  3245. }
  3246. }
  3247. /**
  3248. * Case independent unicode (including supplementary characters) value.
  3249. */
  3250. static final class SingleU extends Node {
  3251. int ch;
  3252. int len;
  3253. SingleU(int c) {
  3254. ch = Character.toLowerCase(Character.toUpperCase(c));
  3255. len = Character.charCount(ch);
  3256. }
  3257. Node dup(boolean not) {
  3258. if (not)
  3259. return new NotSingleU(ch);
  3260. else
  3261. return new SingleU(ch);
  3262. }
  3263. boolean match(Matcher matcher, int i, CharSequence seq) {
  3264. if (i < matcher.to) {
  3265. int c = Character.codePointAt(seq, i);
  3266. if (c == ch)
  3267. return next.match(matcher, i+len, seq);
  3268. int cc = Character.toUpperCase(c);
  3269. cc = Character.toLowerCase(cc);
  3270. if (cc == ch)
  3271. return next.match(matcher, i+Character.charCount(c), seq);
  3272. }
  3273. matcher.hitEnd = true;
  3274. return false;
  3275. }
  3276. boolean study(TreeInfo info) {
  3277. info.minLength++;
  3278. info.maxLength++;
  3279. return next.study(info);
  3280. }
  3281. }
  3282. /**
  3283. * Case independent unicode value.
  3284. */
  3285. static final class NotSingleU extends Node {
  3286. int ch;
  3287. NotSingleU(int c) {
  3288. ch = Character.toLowerCase(Character.toUpperCase((char)c));
  3289. }
  3290. Node dup(boolean not) {
  3291. if (not)
  3292. return new SingleU(ch);
  3293. else
  3294. return new NotSingleU(ch);
  3295. }
  3296. boolean match(Matcher matcher, int i, CharSequence seq) {
  3297. if (i < matcher.to) {
  3298. int c = Character.codePointAt(seq, i);
  3299. if (c == ch)
  3300. return false;
  3301. int cc = Character.toUpperCase(c);
  3302. cc = Character.toLowerCase(cc);
  3303. if (cc != ch)
  3304. return next.match(matcher, i+Character.charCount(c), seq);
  3305. }
  3306. matcher.hitEnd = true;
  3307. return false;
  3308. }
  3309. boolean study(TreeInfo info) {
  3310. info.minLength++;
  3311. info.maxLength++;
  3312. return next.study(info);
  3313. }
  3314. }
  3315. /**
  3316. * Node class that matches a Unicode category.
  3317. */
  3318. static final class Category extends Node {
  3319. int atype;
  3320. Category(int type) {
  3321. atype = type;
  3322. }
  3323. Node dup(boolean not) {
  3324. return new Category(not ? ~atype : atype);
  3325. }
  3326. boolean match(Matcher matcher, int i, CharSequence seq) {
  3327. if (i >= matcher.to) {
  3328. matcher.hitEnd = true;
  3329. return false;
  3330. }
  3331. int c = Character.codePointAt(seq, i);
  3332. return (atype & (1 << Character.getType(c))) != 0
  3333. && next.match(matcher, i+Character.charCount(c), seq);
  3334. }
  3335. boolean study(TreeInfo info) {
  3336. info.minLength++;
  3337. info.maxLength++;
  3338. return next.study(info);
  3339. }
  3340. }
  3341. /**
  3342. * Node class that matches a POSIX type.
  3343. */
  3344. static final class Ctype extends Node {
  3345. int ctype;
  3346. Ctype(int type) {
  3347. ctype = type;
  3348. }
  3349. Node dup(boolean not) {
  3350. if (not) {
  3351. return new NotCtype(ctype);
  3352. } else {
  3353. return new Ctype(ctype);
  3354. }
  3355. }
  3356. boolean match(Matcher matcher, int i, CharSequence seq) {
  3357. if (i >= matcher.to) {
  3358. matcher.hitEnd = true;
  3359. return false;
  3360. }
  3361. int c = Character.codePointAt(seq, i);
  3362. return (c < 128 && ASCII.isType(c, ctype)
  3363. && next.match(matcher, i+1, seq));
  3364. }
  3365. boolean study(TreeInfo info) {
  3366. info.minLength++;
  3367. info.maxLength++;
  3368. return next.study(info);
  3369. }
  3370. }
  3371. static final class NotCtype extends Node {
  3372. int ctype;
  3373. NotCtype(int type) {
  3374. ctype = type;
  3375. }
  3376. Node dup(boolean not) {
  3377. if (not) {
  3378. return new Ctype(ctype);
  3379. } else {
  3380. return new NotCtype(ctype);
  3381. }
  3382. }
  3383. boolean match(Matcher matcher, int i, CharSequence seq) {
  3384. if (i >= matcher.to) {
  3385. matcher.hitEnd = true;
  3386. return false;
  3387. }
  3388. int c = Character.codePointAt(seq, i);
  3389. return ((c >= 128 || !ASCII.isType(c, ctype))
  3390. && next.match(matcher, i+Character.charCount(c), seq));
  3391. }
  3392. boolean study(TreeInfo info) {
  3393. info.minLength++;
  3394. info.maxLength++;
  3395. return next.study(info);
  3396. }
  3397. }
  3398. static abstract class JavaTypeClass extends Node {
  3399. JavaTypeClass() {
  3400. }
  3401. Node dup(boolean not) {
  3402. Node duplicate = null;
  3403. try {
  3404. duplicate = (Node)this.getClass().newInstance();
  3405. } catch (InstantiationException ie) {
  3406. throw new Error("Cannot instantiate node");
  3407. } catch (IllegalAccessException iae) {
  3408. throw new Error("Cannot instantiate node");
  3409. }
  3410. if (not)
  3411. return new Not(duplicate);
  3412. else
  3413. return duplicate;
  3414. }
  3415. abstract boolean isProperty(int ch);
  3416. boolean match(Matcher matcher, int i, CharSequence seq) {
  3417. if (i >= matcher.to) {
  3418. matcher.hitEnd = true;
  3419. return false;
  3420. }
  3421. int c = Character.codePointAt(seq, i);
  3422. return (isProperty(c)
  3423. && next.match(matcher, i+Character.charCount(c), seq));
  3424. }
  3425. boolean study(TreeInfo info) {
  3426. info.minLength++;
  3427. info.maxLength++;
  3428. return next.study(info);
  3429. }
  3430. }
  3431. static final class JavaLowerCase extends JavaTypeClass {
  3432. JavaLowerCase() {
  3433. }
  3434. boolean isProperty(int ch) {
  3435. return Character.isLowerCase(ch);
  3436. }
  3437. }
  3438. static final class JavaUpperCase extends JavaTypeClass {
  3439. JavaUpperCase() {
  3440. }
  3441. boolean isProperty(int ch) {
  3442. return Character.isUpperCase(ch);
  3443. }
  3444. }
  3445. static final class JavaTitleCase extends JavaTypeClass {
  3446. JavaTitleCase() {
  3447. }
  3448. boolean isProperty(int ch) {
  3449. return Character.isTitleCase(ch);
  3450. }
  3451. }
  3452. static final class JavaDigit extends JavaTypeClass {
  3453. JavaDigit() {
  3454. }
  3455. boolean isProperty(int ch) {
  3456. return Character.isDigit(ch);
  3457. }
  3458. }
  3459. static final class JavaDefined extends JavaTypeClass {
  3460. JavaDefined() {
  3461. }
  3462. boolean isProperty(int ch) {
  3463. return Character.isDefined(ch);
  3464. }
  3465. }
  3466. static final class JavaLetter extends JavaTypeClass {
  3467. JavaLetter() {
  3468. }
  3469. boolean isProperty(int ch) {
  3470. return Character.isLetter(ch);
  3471. }
  3472. }
  3473. static final class JavaLetterOrDigit extends JavaTypeClass {
  3474. JavaLetterOrDigit() {
  3475. }
  3476. boolean isProperty(int ch) {
  3477. return Character.isLetterOrDigit(ch);
  3478. }
  3479. }
  3480. static final class JavaJavaIdentifierStart extends JavaTypeClass {
  3481. JavaJavaIdentifierStart() {
  3482. }
  3483. boolean isProperty(int ch) {
  3484. return Character.isJavaIdentifierStart(ch);
  3485. }
  3486. }
  3487. static final class JavaJavaIdentifierPart extends JavaTypeClass {
  3488. JavaJavaIdentifierPart() {
  3489. }
  3490. boolean isProperty(int ch) {
  3491. return Character.isJavaIdentifierPart(ch);
  3492. }
  3493. }
  3494. static final class JavaUnicodeIdentifierStart extends JavaTypeClass {
  3495. JavaUnicodeIdentifierStart() {
  3496. }
  3497. boolean isProperty(int ch) {
  3498. return Character.isUnicodeIdentifierStart(ch);
  3499. }
  3500. }
  3501. static final class JavaUnicodeIdentifierPart extends JavaTypeClass {
  3502. JavaUnicodeIdentifierPart() {
  3503. }
  3504. boolean isProperty(int ch) {
  3505. return Character.isUnicodeIdentifierPart(ch);
  3506. }
  3507. }
  3508. static final class JavaIdentifierIgnorable extends JavaTypeClass {
  3509. JavaIdentifierIgnorable() {
  3510. }
  3511. boolean isProperty(int ch) {
  3512. return Character.isIdentifierIgnorable(ch);
  3513. }
  3514. }
  3515. static final class JavaSpaceChar extends JavaTypeClass {
  3516. JavaSpaceChar() {
  3517. }
  3518. boolean isProperty(int ch) {
  3519. return Character.isSpaceChar(ch);
  3520. }
  3521. }
  3522. static final class JavaWhitespace extends JavaTypeClass {
  3523. JavaWhitespace() {
  3524. }
  3525. boolean isProperty(int ch) {
  3526. return Character.isWhitespace(ch);
  3527. }
  3528. }
  3529. static final class JavaISOControl extends JavaTypeClass {
  3530. JavaISOControl() {
  3531. }
  3532. boolean isProperty(int ch) {
  3533. return Character.isISOControl(ch);
  3534. }
  3535. }
  3536. static final class JavaMirrored extends JavaTypeClass {
  3537. JavaMirrored() {
  3538. }
  3539. boolean isProperty(int ch) {
  3540. return Character.isMirrored(ch);
  3541. }
  3542. }
  3543. static final class Specials extends Node {
  3544. Specials() {
  3545. }
  3546. Node dup(boolean not) {
  3547. if (not)
  3548. return new Not(this);
  3549. else
  3550. return new Specials();
  3551. }
  3552. boolean match(Matcher matcher, int i, CharSequence seq) {
  3553. if (i < matcher.to) {
  3554. int ch = seq.charAt(i);
  3555. return (((ch-0xFFF0) | (0xFFFD-ch)) >= 0 || ch == 0xFEFF)
  3556. && next.match(matcher, i+1, seq);
  3557. }
  3558. matcher.hitEnd = true;
  3559. return false;
  3560. }
  3561. boolean study(TreeInfo info) {
  3562. info.minLength++;
  3563. info.maxLength++;
  3564. return next.study(info);
  3565. }
  3566. }
  3567. static final class Not extends Node {
  3568. Node atom;
  3569. Not(Node atom) {
  3570. this.atom = atom;
  3571. }
  3572. boolean match(Matcher matcher, int i, CharSequence seq) {
  3573. return !atom.match(matcher, i, seq)
  3574. && next.match(matcher, i+countChars(seq, i, 1), seq);
  3575. }
  3576. boolean study(TreeInfo info) {
  3577. info.minLength++;
  3578. info.maxLength++;
  3579. return next.study(info);
  3580. }
  3581. }
  3582. /**
  3583. * Node class for a case sensitive sequence of literal characters.
  3584. */
  3585. static class Slice extends Node {
  3586. int[] buffer;
  3587. Slice(int[] buf) {
  3588. buffer = buf;
  3589. }
  3590. boolean match(Matcher matcher, int i, CharSequence seq) {
  3591. int[] buf = buffer;
  3592. int len = buf.length;
  3593. // Unfortunately we must now void this opto
  3594. // in order to properly report hitEnd...
  3595. //if (i + len > matcher.to) {
  3596. // matcher.hitEnd = true;
  3597. // return false;
  3598. //}
  3599. for (int j=0; j<len; j++) {
  3600. if ((i+j) >= matcher.to) {
  3601. matcher.hitEnd = true;
  3602. return false;
  3603. }
  3604. if (buf[j] != seq.charAt(i+j))
  3605. return false;
  3606. }
  3607. return next.match(matcher, i+len, seq);
  3608. }
  3609. boolean study(TreeInfo info) {
  3610. info.minLength += buffer.length;
  3611. info.maxLength += buffer.length;
  3612. return next.study(info);
  3613. }
  3614. }
  3615. /**
  3616. * Node class for a case insensitive sequence of literal characters.
  3617. */
  3618. static final class SliceA extends Node {
  3619. int[] buffer;
  3620. SliceA(int[] buf) {
  3621. buffer = buf;
  3622. }
  3623. boolean match(Matcher matcher, int i, CharSequence seq) {
  3624. int[] buf = buffer;
  3625. int len = buf.length;
  3626. for (int j=0; j<len; j++) {
  3627. if ((i+j) >= matcher.to) {
  3628. matcher.hitEnd = true;
  3629. return false;
  3630. }
  3631. int c = ASCII.toLower(seq.charAt(i+j));
  3632. if (buf[j] != c)
  3633. return false;
  3634. }
  3635. return next.match(matcher, i+len, seq);
  3636. }
  3637. boolean study(TreeInfo info) {
  3638. info.minLength += buffer.length;
  3639. info.maxLength += buffer.length;
  3640. return next.study(info);
  3641. }
  3642. }
  3643. /**
  3644. * Node class for a case insensitive sequence of literal characters.
  3645. * Uses unicode case folding.
  3646. */
  3647. static final class SliceU extends Node {
  3648. int[] buffer;
  3649. SliceU(int[] buf) {
  3650. buffer = buf;
  3651. }
  3652. boolean match(Matcher matcher, int i, CharSequence seq) {
  3653. int[] buf = buffer;
  3654. int x = i;
  3655. for (int j = 0; j < buf.length; j++) {
  3656. if (x >= matcher.to) {
  3657. matcher.hitEnd = true;
  3658. return false;
  3659. }
  3660. int c = Character.codePointAt(seq, x);
  3661. int cc = Character.toUpperCase(c);
  3662. cc = Character.toLowerCase(cc);
  3663. if (buf[j] != cc) {
  3664. return false;
  3665. }
  3666. x += Character.charCount(c);
  3667. if (x > matcher.to) {
  3668. matcher.hitEnd = true;
  3669. return false;
  3670. }
  3671. }
  3672. return next.match(matcher, x, seq);
  3673. }
  3674. boolean study(TreeInfo info) {
  3675. info.minLength += buffer.length;
  3676. info.maxLength += buffer.length;
  3677. return next.study(info);
  3678. }
  3679. }
  3680. /**
  3681. * Node class for a case sensitive sequence of literal characters
  3682. * including supplementary characters.
  3683. */
  3684. static final class SliceS extends Slice {
  3685. SliceS(int[] buf) {
  3686. super(buf);
  3687. }
  3688. boolean match(Matcher matcher, int i, CharSequence seq) {
  3689. int[] buf = buffer;
  3690. int x = i;
  3691. for (int j = 0; j < buf.length; j++) {
  3692. if (x >= matcher.to) {
  3693. matcher.hitEnd = true;
  3694. return false;
  3695. }
  3696. int c = Character.codePointAt(seq, x);
  3697. if (buf[j] != c)
  3698. return false;
  3699. x += Character.charCount(c);
  3700. if (x > matcher.to) {
  3701. matcher.hitEnd = true;
  3702. return false;
  3703. }
  3704. }
  3705. return next.match(matcher, x, seq);
  3706. }
  3707. }
  3708. /**
  3709. * Node class for matching characters within an explicit value range.
  3710. */
  3711. static class Range extends Node {
  3712. int lower, upper;
  3713. Range() {
  3714. }
  3715. Range(int n) {
  3716. lower = n >>> 16;
  3717. upper = n & 0xFFFF;
  3718. }
  3719. Range(int lower, int upper) {
  3720. this.lower = lower;
  3721. this.upper = upper;
  3722. }
  3723. Node dup(boolean not) {
  3724. if (not)
  3725. return new NotRange(lower, upper);
  3726. else
  3727. return new Range(lower, upper);
  3728. }
  3729. boolean match(Matcher matcher, int i, CharSequence seq) {
  3730. if (i < matcher.to) {
  3731. int ch = Character.codePointAt(seq, i);
  3732. return ((ch-lower)|(upper-ch)) >= 0
  3733. && next.match(matcher, i+Character.charCount(ch), seq);
  3734. }
  3735. matcher.hitEnd = true;
  3736. return false;
  3737. }
  3738. boolean study(TreeInfo info) {
  3739. info.minLength++;
  3740. info.maxLength++;
  3741. return next.study(info);
  3742. }
  3743. }
  3744. /**
  3745. * Node class for matching characters within an explicit value range
  3746. * in a case insensitive manner.
  3747. */
  3748. static final class CIRange extends Range {
  3749. CIRange(int n) {
  3750. lower = n >>> 16;
  3751. upper = n & 0xFFFF;
  3752. }
  3753. CIRange(int lower, int upper) {
  3754. super(lower, upper);
  3755. }
  3756. Node dup(boolean not) {
  3757. if (not)
  3758. return new CINotRange(lower, upper);
  3759. else
  3760. return new CIRange(lower, upper);
  3761. }
  3762. boolean match(Matcher matcher, int i, CharSequence seq) {
  3763. if (i < matcher.to) {
  3764. int ch = Character.codePointAt(seq, i);
  3765. boolean m = (((ch-lower)|(upper-ch)) >= 0);
  3766. if (!m) {
  3767. int cc = Character.toUpperCase(ch);
  3768. m = (((cc-lower)|(upper-cc)) >= 0);
  3769. if (!m) {
  3770. cc = Character.toLowerCase(cc);
  3771. m = (((cc-lower)|(upper-cc)) >= 0);
  3772. }
  3773. }
  3774. return (m && next.match(matcher, i+Character.charCount(ch), seq));
  3775. }
  3776. matcher.hitEnd = true;
  3777. return false;
  3778. }
  3779. }
  3780. static class NotRange extends Node {
  3781. int lower, upper;
  3782. NotRange() {
  3783. }
  3784. NotRange(int n) {
  3785. lower = n >>> 16;
  3786. upper = n & 0xFFFF;
  3787. }
  3788. NotRange(int lower, int upper) {
  3789. this.lower = lower;
  3790. this.upper = upper;
  3791. }
  3792. Node dup(boolean not) {
  3793. if (not) {
  3794. return new Range(lower, upper);
  3795. } else {
  3796. return new NotRange(lower, upper);
  3797. }
  3798. }
  3799. boolean match(Matcher matcher, int i, CharSequence seq) {
  3800. if (i < matcher.to) {
  3801. int ch = Character.codePointAt(seq, i);
  3802. return ((ch-lower)|(upper-ch)) < 0
  3803. && next.match(matcher, i+Character.charCount(ch), seq);
  3804. }
  3805. matcher.hitEnd = true;
  3806. return false;
  3807. }
  3808. boolean study(TreeInfo info) {
  3809. info.minLength++;
  3810. info.maxLength++;
  3811. return next.study(info);
  3812. }
  3813. }
  3814. static class CINotRange extends NotRange {
  3815. int lower, upper;
  3816. CINotRange(int n) {
  3817. lower = n >>> 16;
  3818. upper = n & 0xFFFF;
  3819. }
  3820. CINotRange(int lower, int upper) {
  3821. super(lower, upper);
  3822. }
  3823. Node dup(boolean not) {
  3824. if (not) {
  3825. return new CIRange(lower, upper);
  3826. } else {
  3827. return new CINotRange(lower, upper);
  3828. }
  3829. }
  3830. boolean match(Matcher matcher, int i, CharSequence seq) {
  3831. if (i < matcher.to) {
  3832. int ch = Character.codePointAt(seq, i);
  3833. boolean m = (((ch-lower)|(upper-ch)) < 0);
  3834. if (m) {
  3835. int cc = Character.toUpperCase(ch);
  3836. m = (((cc-lower)|(upper-cc)) < 0);
  3837. if (m) {
  3838. cc = Character.toLowerCase(cc);
  3839. m = (((cc-lower)|(upper-cc)) < 0);
  3840. }
  3841. }
  3842. return (m && next.match(matcher, i+Character.charCount(ch), seq));
  3843. }
  3844. matcher.hitEnd = true;
  3845. return false;
  3846. }
  3847. }
  3848. /**
  3849. * Implements the Unicode category ALL and the dot metacharacter when
  3850. * in dotall mode.
  3851. */
  3852. static final class All extends Node {
  3853. All() {
  3854. super();
  3855. }
  3856. Node dup(boolean not) {
  3857. if (not) {
  3858. return new Single(-1);
  3859. } else {
  3860. return new All();
  3861. }
  3862. }
  3863. boolean match(Matcher matcher, int i, CharSequence seq) {
  3864. if (i < matcher.to) {
  3865. return next.match(matcher, i+countChars(seq, i, 1), seq);
  3866. }
  3867. matcher.hitEnd = true;
  3868. return false;
  3869. }
  3870. boolean study(TreeInfo info) {
  3871. info.minLength++;
  3872. info.maxLength++;
  3873. return next.study(info);
  3874. }
  3875. }
  3876. /**
  3877. * Node class for the dot metacharacter when dotall is not enabled.
  3878. */
  3879. static final class Dot extends Node {
  3880. Dot() {
  3881. super();
  3882. }
  3883. boolean match(Matcher matcher, int i, CharSequence seq) {
  3884. if (i < matcher.to) {
  3885. int ch = Character.codePointAt(seq, i);
  3886. return (ch != '\n' && ch != '\r'
  3887. && (ch|1) != '\u2029'
  3888. && ch != '\u0085'
  3889. && next.match(matcher, i+Character.charCount(ch), seq));
  3890. }
  3891. matcher.hitEnd = true;
  3892. return false;
  3893. }
  3894. boolean study(TreeInfo info) {
  3895. info.minLength++;
  3896. info.maxLength++;
  3897. return next.study(info);
  3898. }
  3899. }
  3900. /**
  3901. * Node class for the dot metacharacter when dotall is not enabled
  3902. * but UNIX_LINES is enabled.
  3903. */
  3904. static final class UnixDot extends Node {
  3905. UnixDot() {
  3906. super();
  3907. }
  3908. boolean match(Matcher matcher, int i, CharSequence seq) {
  3909. if (i < matcher.to) {
  3910. int ch = Character.codePointAt(seq, i);
  3911. return (ch != '\n' && next.match(matcher, i+Character.charCount(ch), seq));
  3912. }
  3913. matcher.hitEnd = true;
  3914. return false;
  3915. }
  3916. boolean study(TreeInfo info) {
  3917. info.minLength++;
  3918. info.maxLength++;
  3919. return next.study(info);
  3920. }
  3921. }
  3922. /**
  3923. * The 0 or 1 quantifier. This one class implements all three types.
  3924. */
  3925. static final class Ques extends Node {
  3926. Node atom;
  3927. int type;
  3928. Ques(Node node, int type) {
  3929. this.atom = node;
  3930. this.type = type;
  3931. }
  3932. boolean match(Matcher matcher, int i, CharSequence seq) {
  3933. switch (type) {
  3934. case GREEDY:
  3935. return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
  3936. || next.match(matcher, i, seq);
  3937. case LAZY:
  3938. return next.match(matcher, i, seq)
  3939. || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
  3940. case POSSESSIVE:
  3941. if (atom.match(matcher, i, seq)) i = matcher.last;
  3942. return next.match(matcher, i, seq);
  3943. default:
  3944. return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
  3945. }
  3946. }
  3947. boolean study(TreeInfo info) {
  3948. if (type != INDEPENDENT) {
  3949. int minL = info.minLength;
  3950. atom.study(info);
  3951. info.minLength = minL;
  3952. info.deterministic = false;
  3953. return next.study(info);
  3954. } else {
  3955. atom.study(info);
  3956. return next.study(info);
  3957. }
  3958. }
  3959. }
  3960. /**
  3961. * Handles the curly-brace style repetition with a specified minimum and
  3962. * maximum occurrences. The * quantifier is handled as a special case.
  3963. * This class handles the three types.
  3964. */
  3965. static final class Curly extends Node {
  3966. Node atom;
  3967. int type;
  3968. int cmin;
  3969. int cmax;
  3970. Curly(Node node, int cmin, int cmax, int type) {
  3971. this.atom = node;
  3972. this.type = type;
  3973. this.cmin = cmin;
  3974. this.cmax = cmax;
  3975. }
  3976. boolean match(Matcher matcher, int i, CharSequence seq) {
  3977. int j;
  3978. for (j = 0; j < cmin; j++) {
  3979. if (atom.match(matcher, i, seq)) {
  3980. i = matcher.last;
  3981. continue;
  3982. }
  3983. return false;
  3984. }
  3985. if (type == GREEDY)
  3986. return match0(matcher, i, j, seq);
  3987. else if (type == LAZY)
  3988. return match1(matcher, i, j, seq);
  3989. else
  3990. return match2(matcher, i, j, seq);
  3991. }
  3992. // Greedy match.
  3993. // i is the index to start matching at
  3994. // j is the number of atoms that have matched
  3995. boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
  3996. if (j >= cmax) {
  3997. // We have matched the maximum... continue with the rest of
  3998. // the regular expression
  3999. return next.match(matcher, i, seq);
  4000. }
  4001. int backLimit = j;
  4002. while (atom.match(matcher, i, seq)) {
  4003. // k is the length of this match
  4004. int k = matcher.last - i;
  4005. if (k == 0) // Zero length match
  4006. break;
  4007. // Move up index and number matched
  4008. i = matcher.last;
  4009. j++;
  4010. // We are greedy so match as many as we can
  4011. while (j < cmax) {
  4012. if (!atom.match(matcher, i, seq))
  4013. break;
  4014. if (i + k != matcher.last) {
  4015. if (match0(matcher, matcher.last, j+1, seq))
  4016. return true;
  4017. break;
  4018. }
  4019. i += k;
  4020. j++;
  4021. }
  4022. // Handle backing off if match fails
  4023. while (j >= backLimit) {
  4024. if (next.match(matcher, i, seq))
  4025. return true;
  4026. i -= k;
  4027. j--;
  4028. }
  4029. return false;
  4030. }
  4031. return next.match(matcher, i, seq);
  4032. }
  4033. // Reluctant match. At this point, the minimum has been satisfied.
  4034. // i is the index to start matching at
  4035. // j is the number of atoms that have matched
  4036. boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
  4037. for (;;) {
  4038. // Try finishing match without consuming any more
  4039. if (next.match(matcher, i, seq))
  4040. return true;
  4041. // At the maximum, no match found
  4042. if (j >= cmax)
  4043. return false;
  4044. // Okay, must try one more atom
  4045. if (!atom.match(matcher, i, seq))
  4046. return false;
  4047. // If we haven't moved forward then must break out
  4048. if (i == matcher.last)
  4049. return false;
  4050. // Move up index and number matched
  4051. i = matcher.last;
  4052. j++;
  4053. }
  4054. }
  4055. boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
  4056. for (; j < cmax; j++) {
  4057. if (!atom.match(matcher, i, seq))
  4058. break;
  4059. if (i == matcher.last)
  4060. break;
  4061. i = matcher.last;
  4062. }
  4063. return next.match(matcher, i, seq);
  4064. }
  4065. boolean study(TreeInfo info) {
  4066. // Save original info
  4067. int minL = info.minLength;
  4068. int maxL = info.maxLength;
  4069. boolean maxV = info.maxValid;
  4070. boolean detm = info.deterministic;
  4071. info.reset();
  4072. atom.study(info);
  4073. int temp = info.minLength * cmin + minL;
  4074. if (temp < minL) {
  4075. temp = 0xFFFFFFF; // arbitrary large number
  4076. }
  4077. info.minLength = temp;
  4078. if (maxV & info.maxValid) {
  4079. temp = info.maxLength * cmax + maxL;
  4080. info.maxLength = temp;
  4081. if (temp < maxL) {
  4082. info.maxValid = false;
  4083. }
  4084. } else {
  4085. info.maxValid = false;
  4086. }
  4087. if (info.deterministic && cmin == cmax)
  4088. info.deterministic = detm;
  4089. else
  4090. info.deterministic = false;
  4091. return next.study(info);
  4092. }
  4093. }
  4094. /**
  4095. * Handles the curly-brace style repetition with a specified minimum and
  4096. * maximum occurrences in deterministic cases. This is an iterative
  4097. * optimization over the Prolog and Loop system which would handle this
  4098. * in a recursive way. The * quantifier is handled as a special case.
  4099. * If capture is true then this class saves group settings and ensures
  4100. * that groups are unset when backing off of a group match.
  4101. */
  4102. static final class GroupCurly extends Node {
  4103. Node atom;
  4104. int type;
  4105. int cmin;
  4106. int cmax;
  4107. int localIndex;
  4108. int groupIndex;
  4109. boolean capture;
  4110. GroupCurly(Node node, int cmin, int cmax, int type, int local,
  4111. int group, boolean capture) {
  4112. this.atom = node;
  4113. this.type = type;
  4114. this.cmin = cmin;
  4115. this.cmax = cmax;
  4116. this.localIndex = local;
  4117. this.groupIndex = group;
  4118. this.capture = capture;
  4119. }
  4120. boolean match(Matcher matcher, int i, CharSequence seq) {
  4121. int[] groups = matcher.groups;
  4122. int[] locals = matcher.locals;
  4123. int save0 = locals[localIndex];
  4124. int save1 = 0;
  4125. int save2 = 0;
  4126. if (capture) {
  4127. save1 = groups[groupIndex];
  4128. save2 = groups[groupIndex+1];
  4129. }
  4130. // Notify GroupTail there is no need to setup group info
  4131. // because it will be set here
  4132. locals[localIndex] = -1;
  4133. boolean ret = true;
  4134. for (int j = 0; j < cmin; j++) {
  4135. if (atom.match(matcher, i, seq)) {
  4136. if (capture) {
  4137. groups[groupIndex] = i;
  4138. groups[groupIndex+1] = matcher.last;
  4139. }
  4140. i = matcher.last;
  4141. } else {
  4142. ret = false;
  4143. break;
  4144. }
  4145. }
  4146. if (!ret) {
  4147. locals[localIndex] = save0;
  4148. if (capture) {
  4149. groups[groupIndex] = save1;
  4150. groups[groupIndex+1] = save2;
  4151. }
  4152. } else if (type == GREEDY) {
  4153. ret = match0(matcher, i, cmin, seq);
  4154. } else if (type == LAZY) {
  4155. ret = match1(matcher, i, cmin, seq);
  4156. } else {
  4157. ret = match2(matcher, i, cmin, seq);
  4158. }
  4159. return ret;
  4160. }
  4161. // Aggressive group match
  4162. boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
  4163. int[] groups = matcher.groups;
  4164. int save0 = 0;
  4165. int save1 = 0;
  4166. if (capture) {
  4167. save0 = groups[groupIndex];
  4168. save1 = groups[groupIndex+1];
  4169. }
  4170. for (;;) {
  4171. if (j >= cmax)
  4172. break;
  4173. if (!atom.match(matcher, i, seq))
  4174. break;
  4175. int k = matcher.last - i;
  4176. if (k <= 0) {
  4177. if (capture) {
  4178. groups[groupIndex] = i;
  4179. groups[groupIndex+1] = i + k;
  4180. }
  4181. i = i + k;
  4182. break;
  4183. }
  4184. for (;;) {
  4185. if (capture) {
  4186. groups[groupIndex] = i;
  4187. groups[groupIndex+1] = i + k;
  4188. }
  4189. i = i + k;
  4190. if (++j >= cmax)
  4191. break;
  4192. if (!atom.match(matcher, i, seq))
  4193. break;
  4194. if (i + k != matcher.last) {
  4195. if (match0(matcher, i, j, seq))
  4196. return true;
  4197. break;
  4198. }
  4199. }
  4200. while (j > cmin) {
  4201. if (next.match(matcher, i, seq)) {
  4202. if (capture) {
  4203. groups[groupIndex+1] = i;
  4204. groups[groupIndex] = i - k;
  4205. }
  4206. i = i - k;
  4207. return true;
  4208. }
  4209. // backing off
  4210. if (capture) {
  4211. groups[groupIndex+1] = i;
  4212. groups[groupIndex] = i - k;
  4213. }
  4214. i = i - k;
  4215. j--;
  4216. }
  4217. break;
  4218. }
  4219. if (capture) {
  4220. groups[groupIndex] = save0;
  4221. groups[groupIndex+1] = save1;
  4222. }
  4223. return next.match(matcher, i, seq);
  4224. }
  4225. // Reluctant matching
  4226. boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
  4227. for (;;) {
  4228. if (next.match(matcher, i, seq))
  4229. return true;
  4230. if (j >= cmax)
  4231. return false;
  4232. if (!atom.match(matcher, i, seq))
  4233. return false;
  4234. if (i == matcher.last)
  4235. return false;
  4236. if (capture) {
  4237. matcher.groups[groupIndex] = i;
  4238. matcher.groups[groupIndex+1] = matcher.last;
  4239. }
  4240. i = matcher.last;
  4241. j++;
  4242. }
  4243. }
  4244. // Possessive matching
  4245. boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
  4246. for (; j < cmax; j++) {
  4247. if (!atom.match(matcher, i, seq)) {
  4248. break;
  4249. }
  4250. if (capture) {
  4251. matcher.groups[groupIndex] = i;
  4252. matcher.groups[groupIndex+1] = matcher.last;
  4253. }
  4254. if (i == matcher.last) {
  4255. break;
  4256. }
  4257. i = matcher.last;
  4258. }
  4259. return next.match(matcher, i, seq);
  4260. }
  4261. boolean study(TreeInfo info) {
  4262. // Save original info
  4263. int minL = info.minLength;
  4264. int maxL = info.maxLength;
  4265. boolean maxV = info.maxValid;
  4266. boolean detm = info.deterministic;
  4267. info.reset();
  4268. atom.study(info);
  4269. int temp = info.minLength * cmin + minL;
  4270. if (temp < minL) {
  4271. temp = 0xFFFFFFF; // Arbitrary large number
  4272. }
  4273. info.minLength = temp;
  4274. if (maxV & info.maxValid) {
  4275. temp = info.maxLength * cmax + maxL;
  4276. info.maxLength = temp;
  4277. if (temp < maxL) {
  4278. info.maxValid = false;
  4279. }
  4280. } else {
  4281. info.maxValid = false;
  4282. }
  4283. if (info.deterministic && cmin == cmax) {
  4284. info.deterministic = detm;
  4285. } else {
  4286. info.deterministic = false;
  4287. }
  4288. return next.study(info);
  4289. }
  4290. }
  4291. /**
  4292. * Handles the branching of alternations. Note this is also used for
  4293. * the ? quantifier to branch between the case where it matches once
  4294. * and where it does not occur.
  4295. */
  4296. static final class Branch extends Node {
  4297. Node prev;
  4298. Branch(Node lhs, Node rhs) {
  4299. this.prev = lhs;
  4300. this.next = rhs;
  4301. }
  4302. boolean match(Matcher matcher, int i, CharSequence seq) {
  4303. return (prev.match(matcher, i, seq) || next.match(matcher, i, seq));
  4304. }
  4305. boolean study(TreeInfo info) {
  4306. int minL = info.minLength;
  4307. int maxL = info.maxLength;
  4308. boolean maxV = info.maxValid;
  4309. info.reset();
  4310. prev.study(info);
  4311. int minL2 = info.minLength;
  4312. int maxL2 = info.maxLength;
  4313. boolean maxV2 = info.maxValid;
  4314. info.reset();
  4315. next.study(info);
  4316. info.minLength = minL + Math.min(minL2, info.minLength);
  4317. info.maxLength = maxL + Math.max(maxL2, info.maxLength);
  4318. info.maxValid = (maxV & maxV2 & info.maxValid);
  4319. info.deterministic = false;
  4320. return false;
  4321. }
  4322. }
  4323. /**
  4324. * The GroupHead saves the location where the group begins in the locals
  4325. * and restores them when the match is done.
  4326. *
  4327. * The matchRef is used when a reference to this group is accessed later
  4328. * in the expression. The locals will have a negative value in them to
  4329. * indicate that we do not want to unset the group if the reference
  4330. * doesn't match.
  4331. */
  4332. static final class GroupHead extends Node {
  4333. int localIndex;
  4334. GroupHead(int localCount) {
  4335. localIndex = localCount;
  4336. }
  4337. boolean match(Matcher matcher, int i, CharSequence seq) {
  4338. int save = matcher.locals[localIndex];
  4339. matcher.locals[localIndex] = i;
  4340. boolean ret = next.match(matcher, i, seq);
  4341. matcher.locals[localIndex] = save;
  4342. return ret;
  4343. }
  4344. boolean matchRef(Matcher matcher, int i, CharSequence seq) {
  4345. int save = matcher.locals[localIndex];
  4346. matcher.locals[localIndex] = ~i; // HACK
  4347. boolean ret = next.match(matcher, i, seq);
  4348. matcher.locals[localIndex] = save;
  4349. return ret;
  4350. }
  4351. }
  4352. /**
  4353. * Recursive reference to a group in the regular expression. It calls
  4354. * matchRef because if the reference fails to match we would not unset
  4355. * the group.
  4356. */
  4357. static final class GroupRef extends Node {
  4358. GroupHead head;
  4359. GroupRef(GroupHead head) {
  4360. this.head = head;
  4361. }
  4362. boolean match(Matcher matcher, int i, CharSequence seq) {
  4363. return head.matchRef(matcher, i, seq)
  4364. && next.match(matcher, matcher.last, seq);
  4365. }
  4366. boolean study(TreeInfo info) {
  4367. info.maxValid = false;
  4368. info.deterministic = false;
  4369. return next.study(info);
  4370. }
  4371. }
  4372. /**
  4373. * The GroupTail handles the setting of group beginning and ending
  4374. * locations when groups are successfully matched. It must also be able to
  4375. * unset groups that have to be backed off of.
  4376. *
  4377. * The GroupTail node is also used when a previous group is referenced,
  4378. * and in that case no group information needs to be set.
  4379. */
  4380. static final class GroupTail extends Node {
  4381. int localIndex;
  4382. int groupIndex;
  4383. GroupTail(int localCount, int groupCount) {
  4384. localIndex = localCount;
  4385. groupIndex = groupCount + groupCount;
  4386. }
  4387. boolean match(Matcher matcher, int i, CharSequence seq) {
  4388. int tmp = matcher.locals[localIndex];
  4389. if (tmp >= 0) { // This is the normal group case.
  4390. // Save the group so we can unset it if it
  4391. // backs off of a match.
  4392. int groupStart = matcher.groups[groupIndex];
  4393. int groupEnd = matcher.groups[groupIndex+1];
  4394. matcher.groups[groupIndex] = tmp;
  4395. matcher.groups[groupIndex+1] = i;
  4396. if (next.match(matcher, i, seq)) {
  4397. return true;
  4398. }
  4399. matcher.groups[groupIndex] = groupStart;
  4400. matcher.groups[groupIndex+1] = groupEnd;
  4401. return false;
  4402. } else {
  4403. // This is a group reference case. We don't need to save any
  4404. // group info because it isn't really a group.
  4405. matcher.last = i;
  4406. return true;
  4407. }
  4408. }
  4409. }
  4410. /**
  4411. * This sets up a loop to handle a recursive quantifier structure.
  4412. */
  4413. static final class Prolog extends Node {
  4414. Loop loop;
  4415. Prolog(Loop loop) {
  4416. this.loop = loop;
  4417. }
  4418. boolean match(Matcher matcher, int i, CharSequence seq) {
  4419. return loop.matchInit(matcher, i, seq);
  4420. }
  4421. boolean study(TreeInfo info) {
  4422. return loop.study(info);
  4423. }
  4424. }
  4425. /**
  4426. * Handles the repetition count for a greedy Curly. The matchInit
  4427. * is called from the Prolog to save the index of where the group
  4428. * beginning is stored. A zero length group check occurs in the
  4429. * normal match but is skipped in the matchInit.
  4430. */
  4431. static class Loop extends Node {
  4432. Node body;
  4433. int countIndex; // local count index in matcher locals
  4434. int beginIndex; // group beginning index
  4435. int cmin, cmax;
  4436. Loop(int countIndex, int beginIndex) {
  4437. this.countIndex = countIndex;
  4438. this.beginIndex = beginIndex;
  4439. }
  4440. boolean match(Matcher matcher, int i, CharSequence seq) {
  4441. // Avoid infinite loop in zero-length case.
  4442. if (i > matcher.locals[beginIndex]) {
  4443. int count = matcher.locals[countIndex];
  4444. // This block is for before we reach the minimum
  4445. // iterations required for the loop to match
  4446. if (count < cmin) {
  4447. matcher.locals[countIndex] = count + 1;
  4448. boolean b = body.match(matcher, i, seq);
  4449. // If match failed we must backtrack, so
  4450. // the loop count should NOT be incremented
  4451. if (!b)
  4452. matcher.locals[countIndex] = count;
  4453. // Return success or failure since we are under
  4454. // minimum
  4455. return b;
  4456. }
  4457. // This block is for after we have the minimum
  4458. // iterations required for the loop to match
  4459. if (count < cmax) {
  4460. matcher.locals[countIndex] = count + 1;
  4461. boolean b = body.match(matcher, i, seq);
  4462. // If match failed we must backtrack, so
  4463. // the loop count should NOT be incremented
  4464. if (!b)
  4465. matcher.locals[countIndex] = count;
  4466. else
  4467. return true;
  4468. }
  4469. }
  4470. return next.match(matcher, i, seq);
  4471. }
  4472. boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4473. int save = matcher.locals[countIndex];
  4474. boolean ret = false;
  4475. if (0 < cmin) {
  4476. matcher.locals[countIndex] = 1;
  4477. ret = body.match(matcher, i, seq);
  4478. } else if (0 < cmax) {
  4479. matcher.locals[countIndex] = 1;
  4480. ret = body.match(matcher, i, seq);
  4481. if (ret == false)
  4482. ret = next.match(matcher, i, seq);
  4483. } else {
  4484. ret = next.match(matcher, i, seq);
  4485. }
  4486. matcher.locals[countIndex] = save;
  4487. return ret;
  4488. }
  4489. boolean study(TreeInfo info) {
  4490. info.maxValid = false;
  4491. info.deterministic = false;
  4492. return false;
  4493. }
  4494. }
  4495. /**
  4496. * Handles the repetition count for a reluctant Curly. The matchInit
  4497. * is called from the Prolog to save the index of where the group
  4498. * beginning is stored. A zero length group check occurs in the
  4499. * normal match but is skipped in the matchInit.
  4500. */
  4501. static final class LazyLoop extends Loop {
  4502. LazyLoop(int countIndex, int beginIndex) {
  4503. super(countIndex, beginIndex);
  4504. }
  4505. boolean match(Matcher matcher, int i, CharSequence seq) {
  4506. // Check for zero length group
  4507. if (i > matcher.locals[beginIndex]) {
  4508. int count = matcher.locals[countIndex];
  4509. if (count < cmin) {
  4510. matcher.locals[countIndex] = count + 1;
  4511. boolean result = body.match(matcher, i, seq);
  4512. // If match failed we must backtrack, so
  4513. // the loop count should NOT be incremented
  4514. if (!result)
  4515. matcher.locals[countIndex] = count;
  4516. return result;
  4517. }
  4518. if (next.match(matcher, i, seq))
  4519. return true;
  4520. if (count < cmax) {
  4521. matcher.locals[countIndex] = count + 1;
  4522. boolean result = body.match(matcher, i, seq);
  4523. // If match failed we must backtrack, so
  4524. // the loop count should NOT be incremented
  4525. if (!result)
  4526. matcher.locals[countIndex] = count;
  4527. return result;
  4528. }
  4529. return false;
  4530. }
  4531. return next.match(matcher, i, seq);
  4532. }
  4533. boolean matchInit(Matcher matcher, int i, CharSequence seq) {
  4534. int save = matcher.locals[countIndex];
  4535. boolean ret = false;
  4536. if (0 < cmin) {
  4537. matcher.locals[countIndex] = 1;
  4538. ret = body.match(matcher, i, seq);
  4539. } else if (next.match(matcher, i, seq)) {
  4540. ret = true;
  4541. } else if (0 < cmax) {
  4542. matcher.locals[countIndex] = 1;
  4543. ret = body.match(matcher, i, seq);
  4544. }
  4545. matcher.locals[countIndex] = save;
  4546. return ret;
  4547. }
  4548. boolean study(TreeInfo info) {
  4549. info.maxValid = false;
  4550. info.deterministic = false;
  4551. return false;
  4552. }
  4553. }
  4554. /**
  4555. * Refers to a group in the regular expression. Attempts to match
  4556. * whatever the group referred to last matched.
  4557. */
  4558. static class BackRef extends Node {
  4559. int groupIndex;
  4560. BackRef(int groupCount) {
  4561. super();
  4562. groupIndex = groupCount + groupCount;
  4563. }
  4564. boolean match(Matcher matcher, int i, CharSequence seq) {
  4565. int j = matcher.groups[groupIndex];
  4566. int k = matcher.groups[groupIndex+1];
  4567. int groupSize = k - j;
  4568. // If the referenced group didn't match, neither can this
  4569. if (j < 0)
  4570. return false;
  4571. // If there isn't enough input left no match
  4572. if (i + groupSize > matcher.to) {
  4573. matcher.hitEnd = true;
  4574. return false;
  4575. }
  4576. // Check each new char to make sure it matches what the group
  4577. // referenced matched last time around
  4578. for (int index=0; index<groupSize; index++)
  4579. if (seq.charAt(i+index) != seq.charAt(j+index))
  4580. return false;
  4581. return next.match(matcher, i+groupSize, seq);
  4582. }
  4583. boolean study(TreeInfo info) {
  4584. info.maxValid = false;
  4585. return next.study(info);
  4586. }
  4587. }
  4588. static class CIBackRef extends Node {
  4589. int groupIndex;
  4590. CIBackRef(int groupCount) {
  4591. super();
  4592. groupIndex = groupCount + groupCount;
  4593. }
  4594. boolean match(Matcher matcher, int i, CharSequence seq) {
  4595. int j = matcher.groups[groupIndex];
  4596. int k = matcher.groups[groupIndex+1];
  4597. int groupSize = k - j;
  4598. // If the referenced group didn't match, neither can this
  4599. if (j < 0)
  4600. return false;
  4601. // If there isn't enough input left no match
  4602. if (i + groupSize > matcher.to) {
  4603. matcher.hitEnd = true;
  4604. return false;
  4605. }
  4606. // Check each new char to make sure it matches what the group
  4607. // referenced matched last time around
  4608. int x = i;
  4609. for (int index=0; index<groupSize; index++) {
  4610. int c1 = Character.codePointAt(seq, x);
  4611. int c2 = Character.codePointAt(seq, j);
  4612. if (c1 != c2) {
  4613. int cc1 = Character.toUpperCase(c1);
  4614. int cc2 = Character.toUpperCase(c2);
  4615. if (cc1 != cc2) {
  4616. cc1 = Character.toLowerCase(cc1);
  4617. cc2 = Character.toLowerCase(cc2);
  4618. if (cc1 != cc2)
  4619. return false;
  4620. }
  4621. }
  4622. x += Character.charCount(c1);
  4623. j += Character.charCount(c2);
  4624. }
  4625. return next.match(matcher, i+groupSize, seq);
  4626. }
  4627. boolean study(TreeInfo info) {
  4628. info.maxValid = false;
  4629. return next.study(info);
  4630. }
  4631. }
  4632. /**
  4633. * Searches until the next instance of its atom. This is useful for
  4634. * finding the atom efficiently without passing an instance of it
  4635. * (greedy problem) and without a lot of wasted search time (reluctant
  4636. * problem).
  4637. */
  4638. static final class First extends Node {
  4639. Node atom;
  4640. First(Node node) {
  4641. this.atom = BnM.optimize(node);
  4642. }
  4643. boolean match(Matcher matcher, int i, CharSequence seq) {
  4644. if (atom instanceof BnM) {
  4645. return atom.match(matcher, i, seq)
  4646. && next.match(matcher, matcher.last, seq);
  4647. }
  4648. for (;;) {
  4649. if (i > matcher.to) {
  4650. matcher.hitEnd = true;
  4651. return false;
  4652. }
  4653. if (atom.match(matcher, i, seq)) {
  4654. return next.match(matcher, matcher.last, seq);
  4655. }
  4656. i += countChars(seq, i, 1);
  4657. matcher.first++;
  4658. }
  4659. }
  4660. boolean study(TreeInfo info) {
  4661. atom.study(info);
  4662. info.maxValid = false;
  4663. info.deterministic = false;
  4664. return next.study(info);
  4665. }
  4666. }
  4667. static final class Conditional extends Node {
  4668. Node cond, yes, not;
  4669. Conditional(Node cond, Node yes, Node not) {
  4670. this.cond = cond;
  4671. this.yes = yes;
  4672. this.not = not;
  4673. }
  4674. boolean match(Matcher matcher, int i, CharSequence seq) {
  4675. if (cond.match(matcher, i, seq)) {
  4676. return yes.match(matcher, i, seq);
  4677. } else {
  4678. return not.match(matcher, i, seq);
  4679. }
  4680. }
  4681. boolean study(TreeInfo info) {
  4682. int minL = info.minLength;
  4683. int maxL = info.maxLength;
  4684. boolean maxV = info.maxValid;
  4685. info.reset();
  4686. yes.study(info);
  4687. int minL2 = info.minLength;
  4688. int maxL2 = info.maxLength;
  4689. boolean maxV2 = info.maxValid;
  4690. info.reset();
  4691. not.study(info);
  4692. info.minLength = minL + Math.min(minL2, info.minLength);
  4693. info.maxLength = maxL + Math.max(maxL2, info.maxLength);
  4694. info.maxValid = (maxV & maxV2 & info.maxValid);
  4695. info.deterministic = false;
  4696. return next.study(info);
  4697. }
  4698. }
  4699. /**
  4700. * Zero width positive lookahead.
  4701. */
  4702. static final class Pos extends Node {
  4703. Node cond;
  4704. Pos(Node cond) {
  4705. this.cond = cond;
  4706. }
  4707. boolean match(Matcher matcher, int i, CharSequence seq) {
  4708. int savedTo = matcher.to;
  4709. boolean conditionMatched = false;
  4710. // Relax transparent region boundaries for lookahead
  4711. if (matcher.transparentBounds)
  4712. matcher.to = matcher.getTextLength();
  4713. try {
  4714. conditionMatched = cond.match(matcher, i, seq);
  4715. } finally {
  4716. // Reinstate region boundaries
  4717. matcher.to = savedTo;
  4718. }
  4719. return conditionMatched && next.match(matcher, i, seq);
  4720. }
  4721. }
  4722. /**
  4723. * Zero width negative lookahead.
  4724. */
  4725. static final class Neg extends Node {
  4726. Node cond;
  4727. Neg(Node cond) {
  4728. this.cond = cond;
  4729. }
  4730. boolean match(Matcher matcher, int i, CharSequence seq) {
  4731. int savedTo = matcher.to;
  4732. boolean conditionMatched = false;
  4733. // Relax transparent region boundaries for lookahead
  4734. if (matcher.transparentBounds)
  4735. matcher.to = matcher.getTextLength();
  4736. try {
  4737. if (i < matcher.to) {
  4738. conditionMatched = !cond.match(matcher, i, seq);
  4739. } else {
  4740. // If a negative lookahead succeeds then more input
  4741. // could cause it to fail!
  4742. matcher.requireEnd = true;
  4743. conditionMatched = !cond.match(matcher, i, seq);
  4744. }
  4745. } finally {
  4746. // Reinstate region boundaries
  4747. matcher.to = savedTo;
  4748. }
  4749. return conditionMatched && next.match(matcher, i, seq);
  4750. }
  4751. }
  4752. /**
  4753. * Zero width positive lookbehind.
  4754. */
  4755. static class Behind extends Node {
  4756. Node cond;
  4757. int rmax, rmin;
  4758. Behind(Node cond, int rmax, int rmin) {
  4759. this.cond = cond;
  4760. this.rmax = rmax;
  4761. this.rmin = rmin;
  4762. }
  4763. boolean match(Matcher matcher, int i, CharSequence seq) {
  4764. int savedFrom = matcher.from;
  4765. boolean conditionMatched = false;
  4766. int startIndex = (!matcher.transparentBounds) ?
  4767. matcher.from : 0;
  4768. int from = Math.max(i - rmax, startIndex);
  4769. for (int j = i - rmin; j >= from; j--) {
  4770. // Relax transparent region boundaries for lookbehind
  4771. if (matcher.transparentBounds)
  4772. matcher.from = 0;
  4773. try {
  4774. conditionMatched =
  4775. (cond.match(matcher, j, seq) && matcher.last == i);
  4776. } finally { // Reinstate region boundaries
  4777. matcher.from = savedFrom;
  4778. }
  4779. if (conditionMatched)
  4780. return next.match(matcher, i, seq);
  4781. }
  4782. return false;
  4783. }
  4784. }
  4785. /**
  4786. * Zero width positive lookbehind, including supplementary
  4787. * characters or unpaired surrogates.
  4788. */
  4789. static final class BehindS extends Behind {
  4790. BehindS(Node cond, int rmax, int rmin) {
  4791. super(cond, rmax, rmin);
  4792. }
  4793. boolean match(Matcher matcher, int i, CharSequence seq) {
  4794. int rmaxChars = countChars(seq, i, -rmax);
  4795. int rminChars = countChars(seq, i, -rmin);
  4796. int savedFrom = matcher.from;
  4797. int startIndex = (!matcher.transparentBounds) ?
  4798. matcher.from : 0;
  4799. boolean conditionMatched = false;
  4800. int from = Math.max(i - rmaxChars, startIndex);
  4801. for (int j = i - rminChars;
  4802. j >= from;
  4803. j -= j>from ? countChars(seq, j, -1) : 1) {
  4804. // Relax transparent region boundaries for lookbehind
  4805. if (matcher.transparentBounds)
  4806. matcher.from = 0;
  4807. try {
  4808. conditionMatched =
  4809. (cond.match(matcher, j, seq) && matcher.last == i);
  4810. } finally { // Reinstate region boundaries
  4811. matcher.from = savedFrom;
  4812. }
  4813. if (conditionMatched)
  4814. return next.match(matcher, i, seq);
  4815. }
  4816. return false;
  4817. }
  4818. }
  4819. /**
  4820. * Zero width negative lookbehind.
  4821. */
  4822. static class NotBehind extends Node {
  4823. Node cond;
  4824. int rmax, rmin;
  4825. NotBehind(Node cond, int rmax, int rmin) {
  4826. this.cond = cond;
  4827. this.rmax = rmax;
  4828. this.rmin = rmin;
  4829. }
  4830. boolean match(Matcher matcher, int i, CharSequence seq) {
  4831. int savedFrom = matcher.from;
  4832. boolean conditionMatched = false;
  4833. int startIndex = (!matcher.transparentBounds) ?
  4834. matcher.from : 0;
  4835. int from = Math.max(i - rmax, startIndex);
  4836. for (int j = i - rmin; j >= from; j--) {
  4837. // Relax transparent region boundaries for lookbehind
  4838. if (matcher.transparentBounds)
  4839. matcher.from = 0;
  4840. try {
  4841. conditionMatched =
  4842. (cond.match(matcher, j, seq) && matcher.last == i);
  4843. } finally { // Reinstate region boundaries
  4844. matcher.from = savedFrom;
  4845. }
  4846. if (conditionMatched)
  4847. return false;
  4848. }
  4849. return next.match(matcher, i, seq);
  4850. }
  4851. }
  4852. /**
  4853. * Zero width negative lookbehind, including supplementary
  4854. * characters or unpaired surrogates.
  4855. */
  4856. static final class NotBehindS extends NotBehind {
  4857. NotBehindS(Node cond, int rmax, int rmin) {
  4858. super(cond, rmax, rmin);
  4859. }
  4860. boolean match(Matcher matcher, int i, CharSequence seq) {
  4861. int rmaxChars = countChars(seq, i, -rmax);
  4862. int rminChars = countChars(seq, i, -rmin);
  4863. int savedFrom = matcher.from;
  4864. boolean conditionMatched = false;
  4865. int startIndex = (!matcher.transparentBounds) ?
  4866. matcher.from : 0;
  4867. int from = Math.max(i - rmaxChars, startIndex);
  4868. for (int j = i - rminChars;
  4869. j >= from;
  4870. j -= j>from ? countChars(seq, j, -1) : 1) {
  4871. // Relax transparent region boundaries for lookbehind
  4872. if (matcher.transparentBounds)
  4873. matcher.from = 0;
  4874. try {
  4875. conditionMatched =
  4876. (cond.match(matcher, j, seq) && matcher.last == i);
  4877. } finally { // Reinstate region boundaries
  4878. matcher.from = savedFrom;
  4879. }
  4880. if (conditionMatched)
  4881. return false;
  4882. }
  4883. return next.match(matcher, i, seq);
  4884. }
  4885. }
  4886. /**
  4887. * An object added to the tree when a character class has an additional
  4888. * range added to it.
  4889. */
  4890. static class Add extends Node {
  4891. Node lhs, rhs;
  4892. Add(Node lhs, Node rhs) {
  4893. this.lhs = lhs;
  4894. this.rhs = rhs;
  4895. }
  4896. boolean match(Matcher matcher, int i, CharSequence seq) {
  4897. if (i < matcher.to) {
  4898. return ((lhs.match(matcher, i, seq) ||
  4899. rhs.match(matcher, i, seq)) &&
  4900. next.match(matcher, matcher.last, seq));
  4901. }
  4902. matcher.hitEnd = true;
  4903. return false;
  4904. }
  4905. boolean study(TreeInfo info) {
  4906. boolean maxV = info.maxValid;
  4907. boolean detm = info.deterministic;
  4908. int minL = info.minLength;
  4909. int maxL = info.maxLength;
  4910. lhs.study(info);
  4911. int minL2 = info.minLength;
  4912. int maxL2 = info.maxLength;
  4913. info.minLength = minL;
  4914. info.maxLength = maxL;
  4915. rhs.study(info);
  4916. info.minLength = Math.min(minL2, info.minLength);
  4917. info.maxLength = Math.max(maxL2, info.maxLength);
  4918. info.maxValid = maxV;
  4919. info.deterministic = detm;
  4920. return next.study(info);
  4921. }
  4922. }
  4923. /**
  4924. * An object added to the tree when a character class has another
  4925. * nested class in it.
  4926. */
  4927. static class Both extends Node {
  4928. Node lhs, rhs;
  4929. Both(Node lhs, Node rhs) {
  4930. this.lhs = lhs;
  4931. this.rhs = rhs;
  4932. }
  4933. boolean match(Matcher matcher, int i, CharSequence seq) {
  4934. if (i < matcher.to) {
  4935. return ((lhs.match(matcher, i, seq) &&
  4936. rhs.match(matcher, i, seq)) &&
  4937. next.match(matcher, matcher.last, seq));
  4938. }
  4939. matcher.hitEnd = true;
  4940. return false;
  4941. }
  4942. boolean study(TreeInfo info) {
  4943. boolean maxV = info.maxValid;
  4944. boolean detm = info.deterministic;
  4945. int minL = info.minLength;
  4946. int maxL = info.maxLength;
  4947. lhs.study(info);
  4948. int minL2 = info.minLength;
  4949. int maxL2 = info.maxLength;
  4950. info.minLength = minL;
  4951. info.maxLength = maxL;
  4952. rhs.study(info);
  4953. info.minLength = Math.min(minL2, info.minLength);
  4954. info.maxLength = Math.max(maxL2, info.maxLength);
  4955. info.maxValid = maxV;
  4956. info.deterministic = detm;
  4957. return next.study(info);
  4958. }
  4959. }
  4960. /**
  4961. * An object added to the tree when a character class has a range
  4962. * or single subtracted from it.
  4963. */
  4964. static final class Sub extends Add {
  4965. Sub(Node lhs, Node rhs) {
  4966. super(lhs, rhs);
  4967. }
  4968. boolean match(Matcher matcher, int i, CharSequence seq) {
  4969. if (i < matcher.to) {
  4970. return !rhs.match(matcher, i, seq)
  4971. && lhs.match(matcher, i, seq)
  4972. && next.match(matcher, matcher.last, seq);
  4973. }
  4974. matcher.hitEnd = true;
  4975. return false;
  4976. }
  4977. boolean study(TreeInfo info) {
  4978. lhs.study(info);
  4979. return next.study(info);
  4980. }
  4981. }
  4982. /**
  4983. * Handles word boundaries. Includes a field to allow this one class to
  4984. * deal with the different types of word boundaries we can match. The word
  4985. * characters include underscores, letters, and digits. Non spacing marks
  4986. * can are also part of a word if they have a base character, otherwise
  4987. * they are ignored for purposes of finding word boundaries.
  4988. */
  4989. static final class Bound extends Node {
  4990. static int LEFT = 0x1;
  4991. static int RIGHT= 0x2;
  4992. static int BOTH = 0x3;
  4993. static int NONE = 0x4;
  4994. int type;
  4995. Bound(int n) {
  4996. type = n;
  4997. }
  4998. int check(Matcher matcher, int i, CharSequence seq) {
  4999. int ch;
  5000. boolean left = false;
  5001. int startIndex = matcher.from;
  5002. int endIndex = matcher.to;
  5003. if (matcher.transparentBounds) {
  5004. startIndex = 0;
  5005. endIndex = matcher.getTextLength();
  5006. }
  5007. if (i > startIndex) {
  5008. ch = Character.codePointBefore(seq, i);
  5009. left = (ch == '_' || Character.isLetterOrDigit(ch) ||
  5010. ((Character.getType(ch) == Character.NON_SPACING_MARK)
  5011. && hasBaseCharacter(matcher, i-1, seq)));
  5012. }
  5013. boolean right = false;
  5014. if (i < endIndex) {
  5015. ch = Character.codePointAt(seq, i);
  5016. right = (ch == '_' || Character.isLetterOrDigit(ch) ||
  5017. ((Character.getType(ch) == Character.NON_SPACING_MARK)
  5018. && hasBaseCharacter(matcher, i, seq)));
  5019. } else {
  5020. // Tried to access char past the end
  5021. matcher.hitEnd = true;
  5022. // The addition of another char could wreck a boundary
  5023. matcher.requireEnd = true;
  5024. }
  5025. return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
  5026. }
  5027. boolean match(Matcher matcher, int i, CharSequence seq) {
  5028. return (check(matcher, i, seq) & type) > 0
  5029. && next.match(matcher, i, seq);
  5030. }
  5031. }
  5032. /**
  5033. * Non spacing marks only count as word characters in bounds calculations
  5034. * if they have a base character.
  5035. */
  5036. private static boolean hasBaseCharacter(Matcher matcher, int i,
  5037. CharSequence seq)
  5038. {
  5039. int start = (!matcher.transparentBounds) ?
  5040. matcher.from : 0;
  5041. for (int x=i; x >= start; x--) {
  5042. int ch = Character.codePointAt(seq, x);
  5043. if (Character.isLetterOrDigit(ch))
  5044. return true;
  5045. if (Character.getType(ch) == Character.NON_SPACING_MARK)
  5046. continue;
  5047. return false;
  5048. }
  5049. return false;
  5050. }
  5051. /**
  5052. * Attempts to match a slice in the input using the Boyer-Moore string
  5053. * matching algorithm. The algorithm is based on the idea that the
  5054. * pattern can be shifted farther ahead in the search text if it is
  5055. * matched right to left.
  5056. * <p>
  5057. * The pattern is compared to the input one character at a time, from
  5058. * the rightmost character in the pattern to the left. If the characters
  5059. * all match the pattern has been found. If a character does not match,
  5060. * the pattern is shifted right a distance that is the maximum of two
  5061. * functions, the bad character shift and the good suffix shift. This
  5062. * shift moves the attempted match position through the input more
  5063. * quickly than a naive one position at a time check.
  5064. * <p>
  5065. * The bad character shift is based on the character from the text that
  5066. * did not match. If the character does not appear in the pattern, the
  5067. * pattern can be shifted completely beyond the bad character. If the
  5068. * character does occur in the pattern, the pattern can be shifted to
  5069. * line the pattern up with the next occurrence of that character.
  5070. * <p>
  5071. * The good suffix shift is based on the idea that some subset on the right
  5072. * side of the pattern has matched. When a bad character is found, the
  5073. * pattern can be shifted right by the pattern length if the subset does
  5074. * not occur again in pattern, or by the amount of distance to the
  5075. * next occurrence of the subset in the pattern.
  5076. *
  5077. * Boyer-Moore search methods adapted from code by Amy Yu.
  5078. */
  5079. static class BnM extends Node {
  5080. int[] buffer;
  5081. int[] lastOcc;
  5082. int[] optoSft;
  5083. /**
  5084. * Pre calculates arrays needed to generate the bad character
  5085. * shift and the good suffix shift. Only the last seven bits
  5086. * are used to see if chars match; This keeps the tables small
  5087. * and covers the heavily used ASCII range, but occasionally
  5088. * results in an aliased match for the bad character shift.
  5089. */
  5090. static Node optimize(Node node) {
  5091. if (!(node instanceof Slice)) {
  5092. return node;
  5093. }
  5094. int[] src = ((Slice) node).buffer;
  5095. int patternLength = src.length;
  5096. // The BM algorithm requires a bit of overhead;
  5097. // If the pattern is short don't use it, since
  5098. // a shift larger than the pattern length cannot
  5099. // be used anyway.
  5100. if (patternLength < 4) {
  5101. return node;
  5102. }
  5103. int i, j, k;
  5104. int[] lastOcc = new int[128];
  5105. int[] optoSft = new int[patternLength];
  5106. // Precalculate part of the bad character shift
  5107. // It is a table for where in the pattern each
  5108. // lower 7-bit value occurs
  5109. for (i = 0; i < patternLength; i++) {
  5110. lastOcc[src[i]&0x7F] = i + 1;
  5111. }
  5112. // Precalculate the good suffix shift
  5113. // i is the shift amount being considered
  5114. NEXT: for (i = patternLength; i > 0; i--) {
  5115. // j is the beginning index of suffix being considered
  5116. for (j = patternLength - 1; j >= i; j--) {
  5117. // Testing for good suffix
  5118. if (src[j] == src[j-i]) {
  5119. // src[j..len] is a good suffix
  5120. optoSft[j-1] = i;
  5121. } else {
  5122. // No match. The array has already been
  5123. // filled up with correct values before.
  5124. continue NEXT;
  5125. }
  5126. }
  5127. // This fills up the remaining of optoSft
  5128. // any suffix can not have larger shift amount
  5129. // then its sub-suffix. Why???
  5130. while (j > 0) {
  5131. optoSft[--j] = i;
  5132. }
  5133. }
  5134. // Set the guard value because of unicode compression
  5135. optoSft[patternLength-1] = 1;
  5136. if (node instanceof SliceS)
  5137. return new BnMS(src, lastOcc, optoSft, node.next);
  5138. return new BnM(src, lastOcc, optoSft, node.next);
  5139. }
  5140. BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
  5141. this.buffer = src;
  5142. this.lastOcc = lastOcc;
  5143. this.optoSft = optoSft;
  5144. this.next = next;
  5145. }
  5146. boolean match(Matcher matcher, int i, CharSequence seq) {
  5147. int[] src = buffer;
  5148. int patternLength = src.length;
  5149. int last = matcher.to - patternLength;
  5150. // Loop over all possible match positions in text
  5151. NEXT: while (i <= last) {
  5152. // Loop over pattern from right to left
  5153. for (int j = patternLength - 1; j >= 0; j--) {
  5154. int ch = seq.charAt(i+j);
  5155. if (ch != src[j]) {
  5156. // Shift search to the right by the maximum of the
  5157. // bad character shift and the good suffix shift
  5158. i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
  5159. continue NEXT;
  5160. }
  5161. }
  5162. // Entire pattern matched starting at i
  5163. matcher.first = i;
  5164. boolean ret = next.match(matcher, i + patternLength, seq);
  5165. if (ret) {
  5166. matcher.first = i;
  5167. matcher.groups[0] = matcher.first;
  5168. matcher.groups[1] = matcher.last;
  5169. return true;
  5170. }
  5171. i++;
  5172. }
  5173. // BnM is only used as the leading node in the unanchored case,
  5174. // and it replaced its Start() which always searches to the end
  5175. // if it doesn't find what it's looking for, so hitEnd is true.
  5176. matcher.hitEnd = true;
  5177. return false;
  5178. }
  5179. boolean study(TreeInfo info) {
  5180. info.minLength += buffer.length;
  5181. info.maxValid = false;
  5182. return next.study(info);
  5183. }
  5184. }
  5185. /**
  5186. * Supplementary support version of BnM(). Unpaired surrogates are
  5187. * also handled by this class.
  5188. */
  5189. static final class BnMS extends BnM {
  5190. int lengthInChars;
  5191. BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
  5192. super(src, lastOcc, optoSft, next);
  5193. for (int x = 0; x < buffer.length; x++) {
  5194. lengthInChars += Character.charCount(buffer[x]);
  5195. }
  5196. }
  5197. boolean match(Matcher matcher, int i, CharSequence seq) {
  5198. int[] src = buffer;
  5199. int patternLength = src.length;
  5200. int last = matcher.to - lengthInChars;
  5201. // Loop over all possible match positions in text
  5202. NEXT: while (i <= last) {
  5203. // Loop over pattern from right to left
  5204. int ch;
  5205. for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
  5206. j > 0; j -= Character.charCount(ch), x--) {
  5207. ch = Character.codePointBefore(seq, i+j);
  5208. if (ch != src[x]) {
  5209. // Shift search to the right by the maximum of the
  5210. // bad character shift and the good suffix shift
  5211. int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
  5212. i += countChars(seq, i, n);
  5213. continue NEXT;
  5214. }
  5215. }
  5216. // Entire pattern matched starting at i
  5217. matcher.first = i;
  5218. boolean ret = next.match(matcher, i + lengthInChars, seq);
  5219. if (ret) {
  5220. matcher.first = i;
  5221. matcher.groups[0] = matcher.first;
  5222. matcher.groups[1] = matcher.last;
  5223. return true;
  5224. }
  5225. i += countChars(seq, i, 1);
  5226. }
  5227. matcher.hitEnd = true;
  5228. return false;
  5229. }
  5230. }
  5231. /**
  5232. * Node class for matching characters in a Unicode block
  5233. */
  5234. static final class UBlock extends Node {
  5235. Character.UnicodeBlock block;
  5236. boolean complementMe = false;
  5237. UBlock() {
  5238. }
  5239. UBlock(Character.UnicodeBlock block, boolean not) {
  5240. this.block = block;
  5241. this.complementMe = not;
  5242. }
  5243. Node dup(boolean not) {
  5244. if (not)
  5245. return new UBlock(block, !complementMe);
  5246. else
  5247. return new UBlock(block, complementMe);
  5248. }
  5249. boolean match(Matcher matcher, int i, CharSequence seq) {
  5250. if (complementMe)
  5251. return notMatch(matcher, i, seq);
  5252. if (i < matcher.to) {
  5253. int ch = Character.codePointAt(seq, i);
  5254. return (block == Character.UnicodeBlock.of(ch) &&
  5255. (next.match(matcher, i+Character.charCount(ch), seq)));
  5256. }
  5257. matcher.hitEnd = true;
  5258. return false;
  5259. }
  5260. boolean notMatch(Matcher matcher, int i, CharSequence seq) {
  5261. if (i < matcher.to) {
  5262. int ch = Character.codePointAt(seq, i);
  5263. return (block != Character.UnicodeBlock.of(ch) &&
  5264. (next.match(matcher, i+Character.charCount(ch), seq)));
  5265. }
  5266. matcher.hitEnd = true;
  5267. return false;
  5268. }
  5269. boolean study(TreeInfo info) {
  5270. info.minLength++;
  5271. info.maxLength++;
  5272. return next.study(info);
  5273. }
  5274. }
  5275. ///////////////////////////////////////////////////////////////////////////////
  5276. ///////////////////////////////////////////////////////////////////////////////
  5277. /**
  5278. * This must be the very first initializer.
  5279. */
  5280. static Node accept = new Node();
  5281. static Node lastAccept = new LastNode();
  5282. static class categoryNames {
  5283. static HashMap cMap = new HashMap();
  5284. static {
  5285. cMap.put("Cn", new Category(1<<0)); // UNASSIGNED
  5286. cMap.put("Lu", new Category(1<<1)); // UPPERCASE_LETTER
  5287. cMap.put("Ll", new Category(1<<2)); // LOWERCASE_LETTER
  5288. cMap.put("Lt", new Category(1<<3)); // TITLECASE_LETTER
  5289. cMap.put("Lm", new Category(1<<4)); // MODIFIER_LETTER
  5290. cMap.put("Lo", new Category(1<<5)); // OTHER_LETTER
  5291. cMap.put("Mn", new Category(1<<6)); // NON_SPACING_MARK
  5292. cMap.put("Me", new Category(1<<7)); // ENCLOSING_MARK
  5293. cMap.put("Mc", new Category(1<<8)); // COMBINING_SPACING_MARK
  5294. cMap.put("Nd", new Category(1<<9)); // DECIMAL_DIGIT_NUMBER
  5295. cMap.put("Nl", new Category(1<<10)); // LETTER_NUMBER
  5296. cMap.put("No", new Category(1<<11)); // OTHER_NUMBER
  5297. cMap.put("Zs", new Category(1<<12)); // SPACE_SEPARATOR
  5298. cMap.put("Zl", new Category(1<<13)); // LINE_SEPARATOR
  5299. cMap.put("Zp", new Category(1<<14)); // PARAGRAPH_SEPARATOR
  5300. cMap.put("Cc", new Category(1<<15)); // CNTRL
  5301. cMap.put("Cf", new Category(1<<16)); // FORMAT
  5302. cMap.put("Co", new Category(1<<18)); // PRIVATE USE
  5303. cMap.put("Cs", new Category(1<<19)); // SURROGATE
  5304. cMap.put("Pd", new Category(1<<20)); // DASH_PUNCTUATION
  5305. cMap.put("Ps", new Category(1<<21)); // START_PUNCTUATION
  5306. cMap.put("Pe", new Category(1<<22)); // END_PUNCTUATION
  5307. cMap.put("Pc", new Category(1<<23)); // CONNECTOR_PUNCTUATION
  5308. cMap.put("Po", new Category(1<<24)); // OTHER_PUNCTUATION
  5309. cMap.put("Sm", new Category(1<<25)); // MATH_SYMBOL
  5310. cMap.put("Sc", new Category(1<<26)); // CURRENCY_SYMBOL
  5311. cMap.put("Sk", new Category(1<<27)); // MODIFIER_SYMBOL
  5312. cMap.put("So", new Category(1<<28)); // OTHER_SYMBOL
  5313. cMap.put("L", new Category(0x0000003E)); // LETTER
  5314. cMap.put("M", new Category(0x000001C0)); // MARK
  5315. cMap.put("N", new Category(0x00000E00)); // NUMBER
  5316. cMap.put("Z", new Category(0x00007000)); // SEPARATOR
  5317. cMap.put("C", new Category(0x000D8000)); // CONTROL
  5318. cMap.put("P", new Category(0x01F00000)); // PUNCTUATION
  5319. cMap.put("S", new Category(0x1E000000)); // SYMBOL
  5320. cMap.put("LD", new Category(0x0000023E)); // LETTER_OR_DIGIT
  5321. cMap.put("L1", new Range(0x000000FF)); // Latin-1
  5322. cMap.put("all", new All()); // ALL
  5323. cMap.put("ASCII", new Range(0x0000007F)); // ASCII
  5324. cMap.put("Alnum", new Ctype(ASCII.ALNUM)); // Alphanumeric characters
  5325. cMap.put("Alpha", new Ctype(ASCII.ALPHA)); // Alphabetic characters
  5326. cMap.put("Blank", new Ctype(ASCII.BLANK)); // Space and tab characters
  5327. cMap.put("Cntrl", new Ctype(ASCII.CNTRL)); // Control characters
  5328. cMap.put("Digit", new Range(('0'<<16)|'9')); // Numeric characters
  5329. cMap.put("Graph", new Ctype(ASCII.GRAPH)); // printable and visible
  5330. cMap.put("Lower", new Range(('a'<<16)|'z')); // Lower-case alphabetic
  5331. cMap.put("Print", new Range(0x0020007E)); // Printable characters
  5332. cMap.put("Punct", new Ctype(ASCII.PUNCT)); // Punctuation characters
  5333. cMap.put("Space", new Ctype(ASCII.SPACE)); // Space characters
  5334. cMap.put("Upper", new Range(('A'<<16)|'Z')); // Upper-case alphabetic
  5335. cMap.put("XDigit", new Ctype(ASCII.XDIGIT)); // hexadecimal digits
  5336. cMap.put("javaLowerCase", new JavaLowerCase());
  5337. cMap.put("javaUpperCase", new JavaUpperCase());
  5338. cMap.put("javaTitleCase", new JavaTitleCase());
  5339. cMap.put("javaDigit", new JavaDigit());
  5340. cMap.put("javaDefined", new JavaDefined());
  5341. cMap.put("javaLetter", new JavaLetter());
  5342. cMap.put("javaLetterOrDigit", new JavaLetterOrDigit());
  5343. cMap.put("javaJavaIdentifierStart", new JavaJavaIdentifierStart());
  5344. cMap.put("javaJavaIdentifierPart", new JavaJavaIdentifierPart());
  5345. cMap.put("javaUnicodeIdentifierStart", new JavaUnicodeIdentifierStart());
  5346. cMap.put("javaUnicodeIdentifierPart", new JavaUnicodeIdentifierPart());
  5347. cMap.put("javaIdentifierIgnorable", new JavaIdentifierIgnorable());
  5348. cMap.put("javaSpaceChar", new JavaSpaceChar());
  5349. cMap.put("javaWhitespace", new JavaWhitespace());
  5350. cMap.put("javaISOControl", new JavaISOControl());
  5351. cMap.put("javaMirrored", new JavaMirrored());
  5352. }
  5353. }
  5354. }