1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  58. import java.text.CharacterIterator;
  59. /**
  60. * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
  61. * This engine does not conform to the POSIX regular expression.
  62. *
  63. * <hr width="50%">
  64. * <h3>How to use</h3>
  65. *
  66. * <dl>
  67. * <dt>A. Standard way
  68. * <dd>
  69. * <pre>
  70. * RegularExpression re = new RegularExpression(<var>regex</var>);
  71. * if (re.matches(text)) { ... }
  72. * </pre>
  73. *
  74. * <dt>B. Capturing groups
  75. * <dd>
  76. * <pre>
  77. * RegularExpression re = new RegularExpression(<var>regex</var>);
  78. * Match match = new Match();
  79. * if (re.matches(text, match)) {
  80. * ... // You can refer captured texts with methods of the <code>Match</code> class.
  81. * }
  82. * </pre>
  83. *
  84. * </dl>
  85. *
  86. * <h4>Case-insensitive matching</h4>
  87. * <pre>
  88. * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
  89. * if (re.matches(text)) { ... }
  90. * </pre>
  91. *
  92. * <h4>Options</h4>
  93. * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
  94. * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
  95. * This <var>options</var> parameter consists of the following characters.
  96. * </p>
  97. * <dl>
  98. * <dt><a name="I_OPTION"><code>"i"</code></a>
  99. * <dd>This option indicates case-insensitive matching.
  100. * <dt><a name="M_OPTION"><code>"m"</code></a>
  101. * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
  102. * <dt><a name="S_OPTION"><code>"s"</code></a>
  103. * <dd class="REGEX"><kbd>.</kbd> matches any one character.
  104. * <dt><a name="U_OPTION"><code>"u"</code></a>
  105. * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode.
  106. * <dt><a name="W_OPTION"><code>"w"</code></a>
  107. * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of
  108. * 'Unicode Regular Expression Guidelines' Revision 4.
  109. * When "w" and "u" are specified at the same time,
  110. * <kbd>\b \B \< \></kbd> are processed for the "w" option.
  111. * <dt><a name="COMMA_OPTION"><code>","</code></a>
  112. * <dd>The parser treats a comma in a character class as a range separator.
  113. * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
  114. * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
  115. *
  116. * <dt><a name="X_OPTION"><code>"X"</code></a>
  117. * <dd class="REGEX">
  118. * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
  119. * The <code>matches()</code> method does not do substring matching
  120. * but entire string matching.
  121. *
  122. * </dl>
  123. *
  124. * <hr width="50%">
  125. * <h3>Syntax</h3>
  126. * <table border="1" bgcolor="#ddeeff">
  127. * <tr>
  128. * <td>
  129. * <h4>Differences from the Perl 5 regular expression</h4>
  130. * <ul>
  131. * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
  132. * <li>Supports subtraction, union, and intersection operations for character classes.
  133. * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
  134. * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
  135. * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
  136. * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
  137. * <Kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{</kbd><var>code</var><kbd>})</kbd>
  138. * </ul>
  139. * </td>
  140. * </tr>
  141. * </table>
  142. *
  143. * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
  144. * <ul>
  145. * <li>Character
  146. * <dl>
  147. * <dt class="REGEX"><kbd>.</kbd> (A period)
  148. * <dd>Matches any one character except the following characters.
  149. * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
  150. * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
  151. * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
  152. * <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
  153. * it matches any character including the above four characters.
  154. *
  155. * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
  156. * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
  157. * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
  158. *
  159. * <dt class="REGEX"><kbd>\c</kbd><var>C</var>
  160. * <dd>Matches a control character.
  161. * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
  162. * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
  163. * It matches a control character of which the character code is less than
  164. * the character code of the <var>C</var> by 0x0040.
  165. * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
  166. * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
  167. *
  168. * <dt class="REGEX">a non-meta character
  169. * <dd>Matches the character.
  170. *
  171. * <dt class="REGEX"><KBD>\</KBD> + a meta character
  172. * <dd>Matches the meta character.
  173. *
  174. * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
  175. * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
  176. * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
  177. * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
  178. *
  179. * <!--
  180. * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
  181. * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
  182. * -->
  183. *
  184. * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
  185. * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
  186. *
  187. * <dt class="REGEX"><kbd>\g</kbd>
  188. * <dd>Matches a grapheme.
  189. * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
  190. *
  191. * <dt class="REGEX"><kbd>\X</kbd>
  192. * <dd class="REGEX">Matches a combining character sequence.
  193. * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
  194. * </dl>
  195. * </li>
  196. *
  197. * <li>Character class
  198. * <dl>
  199. * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
  200. * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
  201. * <dd>Positive character class. It matches a character in ranges.
  202. * <dd><var>R<sub>n</sub></var>:
  203. * <ul>
  204. * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
  205. * <p>This range matches the character.
  206. * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
  207. * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
  208. * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
  209. * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
  210. * <p>...
  211. * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
  212. * <p>These expressions specify the same ranges as the following expressions.
  213. * </ul>
  214. * <p class="REGEX">Enumerated ranges are merged (union operation).
  215. * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
  216. *
  217. * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
  218. * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
  219. * <dd>Negative character class. It matches a character not in ranges.
  220. *
  221. * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
  222. * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
  223. * <dd>Subtraction or union or intersection for character classes.
  224. * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
  225. * <dd>The result of these operations is a <u>positive character class</u>
  226. * even if an expression includes any negative character classes.
  227. * You have to take care on this in case-insensitive matching.
  228. * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
  229. * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
  230. * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
  231. * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
  232. * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
  233. *
  234. * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
  235. * <dd>Character class subtraction for the XML Schema.
  236. * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
  237. *
  238. * <dt class="REGEX"><kbd>\d</kbd>
  239. * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
  240. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  241. * <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
  242. *
  243. * <dt class="REGEX"><kbd>\D</kbd>
  244. * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
  245. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  246. * <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
  247. *
  248. * <dt class="REGEX"><kbd>\s</kbd>
  249. * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
  250. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  251. * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
  252. *
  253. * <dt class="REGEX"><kbd>\S</kbd>
  254. * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
  255. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  256. * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
  257. *
  258. * <dt class="REGEX"><kbd>\w</kbd>
  259. * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
  260. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  261. * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  262. *
  263. * <dt class="REGEX"><kbd>\W</kbd>
  264. * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
  265. * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  266. * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  267. *
  268. * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
  269. * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
  270. * The following names are available:
  271. * <dl>
  272. * <dt>Unicode General Categories:
  273. * <dd><kbd>
  274. * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
  275. * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
  276. * </kbd>
  277. * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
  278. * <dt>Unicode Blocks:
  279. * <dd><kbd>
  280. * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
  281. * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
  282. * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
  283. * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
  284. * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
  285. * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
  286. * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
  287. * Miscellaneous Technical, Control Pictures, Optical Character Recognition,
  288. * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
  289. * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
  290. * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
  291. * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
  292. * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
  293. * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
  294. * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
  295. * Small Form Variants, Arabic Presentation Forms-B, Specials,
  296. * Halfwidth and Fullwidth Forms
  297. * </kbd>
  298. * <dt>Others:
  299. * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
  300. * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
  301. * <dd><kbd>UNASSIGNED</kbd>
  302. * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
  303. * </dl>
  304. *
  305. * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
  306. * <dd>Matches one character not in the specified General Category or the specified Block.
  307. * </dl>
  308. * </li>
  309. *
  310. * <li>Selection and Quantifier
  311. * <dl>
  312. * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
  313. * <dd>...
  314. *
  315. * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
  316. * <dd>Matches 0 or more <var>X</var>.
  317. *
  318. * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
  319. * <dd>Matches 1 or more <var>X</var>.
  320. *
  321. * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
  322. * <dd>Matches 0 or 1 <var>X</var>.
  323. *
  324. * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
  325. * <dd>Matches <var>number</var> times.
  326. *
  327. * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
  328. * <dd>...
  329. *
  330. * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
  331. * <dd>...
  332. *
  333. * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
  334. * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
  335. * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
  336. * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
  337. * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
  338. * <dd>Non-greedy matching.
  339. * </dl>
  340. * </li>
  341. *
  342. * <li>Grouping, Capturing, and Back-reference
  343. * <dl>
  344. * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
  345. * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
  346. * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
  347. * you have to write "<KBD>(?:foo)+</KBD>".
  348. *
  349. * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
  350. * <dd>Grouping with capturing.
  351. * It makes a group and applications can know
  352. * where in target text a group matched with methods of a <code>Match</code> instance
  353. * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.impl.xpath.regex.Match)">matches(String,Match)</a></code>.
  354. * The 0th group means the whole of this regular expression.
  355. * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
  356. *
  357. * <p>For instance, a regular expression is
  358. * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>"
  359. * and target text is
  360. * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>":
  361. * <ul>
  362. * <li><code>Match.getCapturedText(0)</code>:
  363. * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>"
  364. * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
  365. * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
  366. * </ul>
  367. *
  368. * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
  369. * <dd>
  370. *
  371. * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
  372. * <dd>Independent expression group. ................
  373. *
  374. * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  375. * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  376. * <dd>............................
  377. * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
  378. * Note that it can not contain 'u'.
  379. *
  380. * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
  381. * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
  382. * <dd>......
  383. * <dd>These expressions must be at the beginning of a group.
  384. * </dl>
  385. * </li>
  386. *
  387. * <li>Anchor
  388. * <dl>
  389. * <dt class="REGEX"><kbd>\A</kbd>
  390. * <dd>Matches the beginning of the text.
  391. *
  392. * <dt class="REGEX"><kbd>\Z</kbd>
  393. * <dd>Matches the end of the text, or before an EOL character at the end of the text,
  394. * or CARRIAGE RETURN + LINE FEED at the end of the text.
  395. *
  396. * <dt class="REGEX"><kbd>\z</kbd>
  397. * <dd>Matches the end of the text.
  398. *
  399. * <dt class="REGEX"><kbd>^</kbd>
  400. * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
  401. * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  402. * it matches the beginning of the text, or after one of EOL characters (
  403. * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
  404. * PARAGRAPH SEPARATOR (U+2029).)
  405. *
  406. * <dt class="REGEX"><kbd>$</kbd>
  407. * <dd>Matches the end of the text, or before an EOL character at the end of the text,
  408. * or CARRIAGE RETURN + LINE FEED at the end of the text.
  409. * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  410. * it matches the end of the text, or before an EOL character.
  411. *
  412. * <dt class="REGEX"><kbd>\b</kbd>
  413. * <dd>Matches word boundary.
  414. * (See <a href="#W_OPTION">a "w" option</a>)
  415. *
  416. * <dt class="REGEX"><kbd>\B</kbd>
  417. * <dd>Matches non word boundary.
  418. * (See <a href="#W_OPTION">a "w" option</a>)
  419. *
  420. * <dt class="REGEX"><kbd>\<</kbd>
  421. * <dd>Matches the beginning of a word.
  422. * (See <a href="#W_OPTION">a "w" option</a>)
  423. *
  424. * <dt class="REGEX"><kbd>\></kbd>
  425. * <dd>Matches the end of a word.
  426. * (See <a href="#W_OPTION">a "w" option</a>)
  427. * </dl>
  428. * </li>
  429. * <li>Lookahead and lookbehind
  430. * <dl>
  431. * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
  432. * <dd>Lookahead.
  433. *
  434. * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
  435. * <dd>Negative lookahead.
  436. *
  437. * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd>
  438. * <dd>Lookbehind.
  439. * <dd>(Note for text capturing......)
  440. *
  441. * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd>
  442. * <dd>Negative lookbehind.
  443. * </dl>
  444. * </li>
  445. *
  446. * <li>Misc.
  447. * <dl>
  448. * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
  449. * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
  450. * <dd>......
  451. * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
  452. * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'.
  453. * You can not write comments in character classes and before quantifiers.
  454. * </dl>
  455. * </li>
  456. * </ul>
  457. *
  458. *
  459. * <hr width="50%">
  460. * <h3>BNF for the regular expression</h3>
  461. * <pre>
  462. * regex ::= ('(?' options ')')? term ('|' term)*
  463. * term ::= factor+
  464. * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
  465. * | '(?#' [^)]* ')'
  466. * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
  467. * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  468. * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
  469. * | '(?>' regex ')' | '(?' options ':' regex ')'
  470. * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
  471. * options ::= [imsw]* ('-' [imsw]+)?
  472. * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  473. * looks ::= '(?=' regex ')' | '(?!' regex ')'
  474. * | '(?<=' regex ')' | '(?<!' regex ')'
  475. * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
  476. * category-block ::= '\' [pP] category-symbol-1
  477. * | ('\p{' | '\P{') (category-symbol | block-name
  478. * | other-properties) '}'
  479. * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
  480. * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
  481. * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
  482. * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
  483. * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
  484. * | 'Sm' | 'Sc' | 'Sk' | 'So'
  485. * block-name ::= (See above)
  486. * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
  487. * character-1 ::= (any character except meta-characters)
  488. *
  489. * char-class ::= '[' ranges ']'
  490. * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
  491. * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
  492. * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
  493. * | range-char | range-char '-' range-char
  494. * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
  495. * code-point ::= '\x' hex-char hex-char
  496. * | '\x{' hex-char+ '}'
  497. * <!-- | '\u005c u' hex-char hex-char hex-char hex-char
  498. * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
  499. * hex-char ::= [0-9a-fA-F]
  500. * character-2 ::= (any character except \[]-,)
  501. * </pre>
  502. *
  503. * <hr width="50%">
  504. * <h3>TODO</h3>
  505. * <ul>
  506. * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
  507. * <ul>
  508. * <li>2.4 Canonical Equivalents
  509. * <li>Level 3
  510. * </ul>
  511. * <li>Parsing performance
  512. * </ul>
  513. *
  514. * <hr width="50%">
  515. *
  516. * @author TAMURA Kent <kent@trl.ibm.co.jp>
  517. * @version $Id: RegularExpression.java,v 1.7 2003/07/15 12:28:25 neeraj Exp $
  518. */
  519. public class RegularExpression implements java.io.Serializable {
  520. static final boolean DEBUG = false;
  521. /**
  522. * Builds the operation flow for the token tree <var>tok</var>; does nothing when <code>operations</code> is already set.
  523. */
  524. private synchronized void compile(Token tok) {
  525. if (this.operations == null) {
  526. this.numberOfClosures = 0;
  527. this.operations = this.compile(tok, null, false);
  528. }
  529. }
  530. /**
  531. * Converts a token to an operation.
  * <p>Recursively compiles the subtree rooted at <var>tok</var> and returns the first
  * operation of the resulting flow. Operations built for <var>tok</var> are linked so that
  * they continue with <var>next</var>.
  *
  * @param tok Token (sub)tree to compile.
  * @param next Operation to continue with after <var>tok</var>; null when there is no continuation
  * (the top-level call and the sub-flows of look-around, independent and modifier groups pass null).
  * @param reverse true to chain the children of a concatenation in reverse order;
  * the look-behind cases compile their child with this flag set.
  * @return Head of the compiled flow; this can be <var>next</var> itself (e.g. for <code>Token.EMPTY</code>).
  * @throws RuntimeException if <var>tok</var> has an unknown type.
  532. */
  533. private Op compile(Token tok, Op next, boolean reverse) {
  534. Op ret;
  535. switch (tok.type) {
  536. case Token.DOT: // leaf tokens: create the corresponding op and link it to the continuation
  537. ret = Op.createDot();
  538. ret.next = next;
  539. break;
  540. case Token.CHAR:
  541. ret = Op.createChar(tok.getChar());
  542. ret.next = next;
  543. break;
  544. case Token.ANCHOR:
  545. ret = Op.createAnchor(tok.getChar());
  546. ret.next = next;
  547. break;
  548. case Token.RANGE:
  549. case Token.NRANGE:
  550. ret = Op.createRange(tok);
  551. ret.next = next;
  552. break;
  553. case Token.CONCAT:
  554. ret = next;
  555. if (!reverse) {
  556. for (int i = tok.size()-1; i >= 0; i --) { // last child first, so each child continues with the flow of the child after it
  557. ret = compile(tok.getChild(i), ret, false);
  558. }
  559. } else {
  560. for (int i = 0; i < tok.size(); i ++) { // first child first, so the resulting flow runs the children in reverse order
  561. ret = compile(tok.getChild(i), ret, true);
  562. }
  563. }
  564. break;
  565. case Token.UNION:
  566. Op.UnionOp uni = Op.createUnion(tok.size());
  567. for (int i = 0; i < tok.size(); i ++) {
  568. uni.addElement(compile(tok.getChild(i), next, reverse)); // every alternative continues with the same next
  569. }
  570. ret = uni; // ret.next is null: it is never assigned here, each alternative already carries the continuation
  571. break;
  572. case Token.CLOSURE:
  573. case Token.NONGREEDYCLOSURE:
  574. Token child = tok.getChild(0);
  575. int min = tok.getMin();
  576. int max = tok.getMax();
  577. if (min >= 0 && min == max) { // {n}: exactly n chained copies of the child, no loop op needed
  578. ret = next;
  579. for (int i = 0; i < min; i ++) {
  580. ret = compile(child, ret, reverse);
  581. }
  582. break;
  583. }
  584. if (min > 0 && max > 0) // from here on max counts only the optional repetitions beyond min
  585. max -= min;
  586. if (max > 0) { // bounded: nest one question op per optional repetition
  587. // X{2,6} -> XX(X(X(XX?)?)?)?
  588. ret = next;
  589. for (int i = 0; i < max; i ++) {
  590. Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
  591. q.next = next; // every question op gets the original continuation as its next
  592. q.setChild(compile(child, ret, reverse)); // its child runs one copy, then the previously built (inner) question
  593. ret = q;
  594. }
  595. } else { // max <= 0 here; NOTE(review): presumably a negative getMax() means "no upper bound" -- confirm in Token
  596. Op.ChildOp op;
  597. if (tok.type == Token.NONGREEDYCLOSURE) {
  598. op = Op.createNonGreedyClosure();
  599. } else { // Token.CLOSURE
  600. if (child.getMinLength() == 0) // child's minimum length is 0: the closure gets its own index; NOTE(review): presumably to guard against looping on empty matches -- confirm in the matcher
  601. op = Op.createClosure(this.numberOfClosures++);
  602. else
  603. op = Op.createClosure(-1);
  604. }
  605. op.next = next;
  606. op.setChild(compile(child, op, reverse)); // the child continues with the closure op itself, which forms the loop
  607. ret = op;
  608. }
  609. if (min > 0) { // put the min mandatory copies in front of the optional/looping part
  610. for (int i = 0; i < min; i ++) {
  611. ret = compile(child, ret, reverse);
  612. }
  613. }
  614. break;
  615. case Token.EMPTY: // nothing to match: the flow is just the continuation
  616. ret = next;
  617. break;
  618. case Token.STRING:
  619. ret = Op.createString(tok.getString());
  620. ret.next = next;
  621. break;
  622. case Token.BACKREFERENCE:
  623. ret = Op.createBackReference(tok.getReferenceNumber());
  624. ret.next = next;
  625. break;
  626. case Token.PAREN: // NOTE(review): the comments below assume createCapture's second argument is the op's next -- confirm in Op
  627. if (tok.getParenNumber() == 0) { // paren number 0: compile the child directly, no capture ops
  628. ret = compile(tok.getChild(0), next, reverse);
  629. } else if (reverse) { // reverse flow: capture(-n) -> child -> capture(+n) -> next
  630. next = Op.createCapture(tok.getParenNumber(), next);
  631. next = compile(tok.getChild(0), next, reverse);
  632. ret = Op.createCapture(-tok.getParenNumber(), next);
  633. } else { // forward flow: capture(+n) -> child -> capture(-n) -> next
  634. next = Op.createCapture(-tok.getParenNumber(), next);
  635. next = compile(tok.getChild(0), next, reverse);
  636. ret = Op.createCapture(tok.getParenNumber(), next);
  637. }
  638. break;
  639. case Token.LOOKAHEAD: // look-around: the child is compiled as a separate flow with no continuation (null)
  640. ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
  641. break;
  642. case Token.NEGATIVELOOKAHEAD:
  643. ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
  644. break;
  645. case Token.LOOKBEHIND: // look-behind children are always compiled with reverse == true
  646. ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
  647. break;
  648. case Token.NEGATIVELOOKBEHIND:
  649. ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
  650. break;
  651. case Token.INDEPENDENT: // child compiled as a separate flow (null continuation), keeping the current direction
  652. ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
  653. break;
  654. case Token.MODIFIERGROUP: // as above, plus the option bits and mask taken from the modifier token
  655. ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
  656. ((Token.ModifierToken)tok).getOptions(),
  657. ((Token.ModifierToken)tok).getOptionsMask());
  658. break;
  659. case Token.CONDITION:
  660. Token.ConditionToken ctok = (Token.ConditionToken)tok;
  661. int ref = ctok.refNumber;
  662. Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); // condition pattern is optional and compiled without a continuation
  663. Op yes = compile(ctok.yes, next, reverse);
  664. Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); // the no-branch is optional; both branches continue with next
  665. ret = Op.createCondition(next, ref, condition, yes, no);
  666. break;
  667. default:
  668. throw new RuntimeException("Unknown token type: "+tok.type);
  669. } // switch (tok.type)
  670. return ret;
  671. }
  672. //Public
  673. /**
  674. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  675. *
  676. * @return true if the target is matched to this regular expression.
  677. */
  678. public boolean matches(char[] target) {
  679. return this.matches(target, 0, target .length , (Match)null);
  680. }
  681. /**
  682. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  683. * in specified range or not.
  684. *
  685. * @param start Start offset of the range.
  686. * @param end End offset +1 of the range.
  687. * @return true if the target is matched to this regular expression.
  688. */
  689. public boolean matches(char[] target, int start, int end) {
  690. return this.matches(target, start, end, (Match)null);
  691. }
  692. /**
  693. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  694. *
  695. * @param match A Match instance for storing matching result.
  696. * @return Offset of the start position in <VAR>target</VAR> or -1 if not match.
  697. */
  698. public boolean matches(char[] target, Match match) {
  699. return this.matches(target, 0, target .length , match);
  700. }
  701. /**
  702. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  703. * in specified range or not.
  704. *
  705. * @param start Start offset of the range.
  706. * @param end End offset +1 of the range.
  707. * @param match A Match instance for storing matching result.
  708. * @return Offset of the start position in <VAR>target</VAR> or -1 if not match.
  709. */
  710. public boolean matches(char[] target, int start, int end, Match match) {
  711. synchronized (this) {
  712. if (this.operations == null)
  713. this.prepare();
  714. if (this.context == null)
  715. this.context = new Context();
  716. }
  717. Context con = null;
  718. synchronized (this.context) {
  719. con = this.context.inuse ? new Context() : this.context;
  720. con.reset(target, start, end, this.numberOfClosures);
  721. }
  722. if (match != null) {
  723. match.setNumberOfGroups(this.nofparen);
  724. match.setSource(target);
  725. } else if (this.hasBackReferences) {
  726. match = new Match();
  727. match.setNumberOfGroups(this.nofparen);
  728. // Need not to call setSource() because
  729. // a caller can not access this match instance.
  730. }
  731. con.match = match;
  732. if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
  733. int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  734. //System.err.println("DEBUG: matchEnd="+matchEnd);
  735. if (matchEnd == con.limit) {
  736. if (con.match != null) {
  737. con.match.setBeginning(0, con.start);
  738. con.match.setEnd(0, matchEnd);
  739. }
  740. con.inuse = false;
  741. return true;
  742. }
  743. return false;
  744. }
  745. /*
  746. * The pattern has only fixed string.
  747. * The engine uses Boyer-Moore.
  748. */
  749. if (this.fixedStringOnly) {
  750. //System.err.println("DEBUG: fixed-only: "+this.fixedString);
  751. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  752. if (o >= 0) {
  753. if (con.match != null) {
  754. con.match.setBeginning(0, o);
  755. con.match.setEnd(0, o+this.fixedString.length());
  756. }
  757. con.inuse = false;
  758. return true;
  759. }
  760. con.inuse = false;
  761. return false;
  762. }
  763. /*
  764. * The pattern contains a fixed string.
  765. * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
  766. * If not, it return with false.
  767. */
  768. if (this.fixedString != null) {
  769. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  770. if (o < 0) {
  771. //System.err.println("Non-match in fixed-string search.");
  772. con.inuse = false;
  773. return false;
  774. }
  775. }
  776. int limit = con.limit-this.minlength;
  777. int matchStart;
  778. int matchEnd = -1;
  779. /*
  780. * Checks whether the expression starts with ".*".
  781. */
  782. if (this.operations != null
  783. && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  784. if (isSet(this.options, SINGLE_LINE)) {
  785. matchStart = con.start;
  786. matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  787. } else {
  788. boolean previousIsEOL = true;
  789. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  790. int ch = target [ matchStart ] ;
  791. if (isEOLChar(ch)) {
  792. previousIsEOL = true;
  793. } else {
  794. if (previousIsEOL) {
  795. if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  796. matchStart, 1, this.options)))
  797. break;
  798. }
  799. previousIsEOL = false;
  800. }
  801. }
  802. }
  803. }
  804. /*
  805. * Optimization against the first character.
  806. */
  807. else if (this.firstChar != null) {
  808. //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
  809. RangeToken range = this.firstChar;
  810. if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
  811. range = this.firstChar.getCaseInsensitiveToken();
  812. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  813. int ch = target [ matchStart ] ;
  814. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
  815. ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
  816. if (!range.match(ch)) continue;
  817. } else {
  818. if (!range.match(ch)) {
  819. char ch1 = Character.toUpperCase((char)ch);
  820. if (!range.match(ch1))
  821. if (!range.match(Character.toLowerCase(ch1)))
  822. continue;
  823. }
  824. }
  825. if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  826. matchStart, 1, this.options)))
  827. break;
  828. }
  829. } else {
  830. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  831. int ch = target [ matchStart ] ;
  832. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
  833. ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] );
  834. if (!range.match(ch)) continue;
  835. if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  836. matchStart, 1, this.options)))
  837. break;
  838. }
  839. }
  840. }
  841. /*
  842. * Straightforward matching.
  843. */
  844. else {
  845. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  846. if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
  847. break;
  848. }
  849. }
  850. if (matchEnd >= 0) {
  851. if (con.match != null) {
  852. con.match.setBeginning(0, matchStart);
  853. con.match.setEnd(0, matchEnd);
  854. }
  855. con.inuse = false;
  856. return true;
  857. } else {
  858. con.inuse = false;
  859. return false;
  860. }
  861. }
  862. /**
  863. * @return -1 when not match; offset of the end of matched string when match.
  864. */
  865. private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
  866. char[] target = con.charTarget;
  867. while (true) {
  868. if (op == null)
  869. return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
  870. if (offset > con.limit || offset < con.start)
  871. return -1;
  872. switch (op.type) {
  873. case Op.CHAR:
  874. if (isSet(opts, IGNORE_CASE)) {
  875. int ch = op.getData();
  876. if (dx > 0) {
  877. if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] ))
  878. return -1;
  879. offset ++;
  880. } else {
  881. int o1 = offset-1;
  882. if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] ))
  883. return -1;
  884. offset = o1;
  885. }
  886. } else {
  887. int ch = op.getData();
  888. if (dx > 0) {
  889. if (offset >= con.limit || ch != target [ offset ] )
  890. return -1;
  891. offset ++;
  892. } else {
  893. int o1 = offset-1;
  894. if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] )
  895. return -1;
  896. offset = o1;
  897. }
  898. }
  899. op = op.next;
  900. break;
  901. case Op.DOT:
  902. if (dx > 0) {
  903. if (offset >= con.limit)
  904. return -1;
  905. int ch = target [ offset ] ;
  906. if (isSet(opts, SINGLE_LINE)) {
  907. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  908. offset ++;
  909. } else {
  910. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  911. ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
  912. if (isEOLChar(ch))
  913. return -1;
  914. }
  915. offset ++;
  916. } else {
  917. int o1 = offset-1;
  918. if (o1 >= con.limit || o1 < 0)
  919. return -1;
  920. int ch = target [ o1 ] ;
  921. if (isSet(opts, SINGLE_LINE)) {
  922. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  923. o1 --;
  924. } else {
  925. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  926. ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
  927. if (!isEOLChar(ch))
  928. return -1;
  929. }
  930. offset = o1;
  931. }
  932. op = op.next;
  933. break;
  934. case Op.RANGE:
  935. case Op.NRANGE:
  936. if (dx > 0) {
  937. if (offset >= con.limit)
  938. return -1;
  939. int ch = target [ offset ] ;
  940. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  941. ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] );
  942. RangeToken tok = op.getToken();
  943. if (isSet(opts, IGNORE_CASE)) {
  944. tok = tok.getCaseInsensitiveToken();
  945. if (!tok.match(ch)) {
  946. if (ch >= 0x10000) return -1;
  947. char uch;
  948. if (!tok.match(uch = Character.toUpperCase((char)ch))
  949. && !tok.match(Character.toLowerCase(uch)))
  950. return -1;
  951. }
  952. } else {
  953. if (!tok.match(ch)) return -1;
  954. }
  955. offset ++;
  956. } else {
  957. int o1 = offset-1;
  958. if (o1 >= con.limit || o1 < 0)
  959. return -1;
  960. int ch = target [ o1 ] ;
  961. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  962. ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch);
  963. RangeToken tok = op.getToken();
  964. if (isSet(opts, IGNORE_CASE)) {
  965. tok = tok.getCaseInsensitiveToken();
  966. if (!tok.match(ch)) {
  967. if (ch >= 0x10000) return -1;
  968. char uch;
  969. if (!tok.match(uch = Character.toUpperCase((char)ch))
  970. && !tok.match(Character.toLowerCase(uch)))
  971. return -1;
  972. }
  973. } else {
  974. if (!tok.match(ch)) return -1;
  975. }
  976. offset = o1;
  977. }
  978. op = op.next;
  979. break;
  980. case Op.ANCHOR:
  981. boolean go = false;
  982. switch (op.getData()) {
  983. case '^':
  984. if (isSet(opts, MULTIPLE_LINES)) {
  985. if (!(offset == con.start
  986. || offset > con.start && isEOLChar( target [ offset-1 ] )))
  987. return -1;
  988. } else {
  989. if (offset != con.start)
  990. return -1;
  991. }
  992. break;
  993. case '@': // Internal use only.
  994. // The @ always matches line beginnings.
  995. if (!(offset == con.start
  996. || offset > con.start && isEOLChar( target [ offset-1 ] )))
  997. return -1;
  998. break;
  999. case '$':
  1000. if (isSet(opts, MULTIPLE_LINES)) {
  1001. if (!(offset == con.limit
  1002. || offset < con.limit && isEOLChar( target [ offset ] )))
  1003. return -1;
  1004. } else {
  1005. if (!(offset == con.limit
  1006. || offset+1 == con.limit && isEOLChar( target [ offset ] )
  1007. || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
  1008. && target [ offset+1 ] == LINE_FEED))
  1009. return -1;
  1010. }
  1011. break;
  1012. case 'A':
  1013. if (offset != con.start) return -1;
  1014. break;
  1015. case 'Z':
  1016. if (!(offset == con.limit
  1017. || offset+1 == con.limit && isEOLChar( target [ offset ] )
  1018. || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN
  1019. && target [ offset+1 ] == LINE_FEED))
  1020. return -1;
  1021. break;
  1022. case 'z':
  1023. if (offset != con.limit) return -1;
  1024. break;
  1025. case 'b':
  1026. if (con.length == 0) return -1;
  1027. {
  1028. int after = getWordType(target, con.start, con.limit, offset, opts);
  1029. if (after == WT_IGNORE) return -1;
  1030. int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
  1031. if (after == before) return -1;
  1032. }
  1033. break;
  1034. case 'B':
  1035. if (con.length == 0)
  1036. go = true;
  1037. else {
  1038. int after = getWordType(target, con.start, con.limit, offset, opts);
  1039. go = after == WT_IGNORE
  1040. || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
  1041. }
  1042. if (!go) return -1;
  1043. break;
  1044. case '<':
  1045. if (con.length == 0 || offset == con.limit) return -1;
  1046. if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
  1047. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
  1048. return -1;
  1049. break;
  1050. case '>':
  1051. if (con.length == 0 || offset == con.start) return -1;
  1052. if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
  1053. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
  1054. return -1;
  1055. break;
  1056. } // switch anchor type
  1057. op = op.next;
  1058. break;
  1059. case Op.BACKREFERENCE:
  1060. {
  1061. int refno = op.getData();
  1062. if (refno <= 0 || refno >= this.nofparen)
  1063. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
  1064. if (con.match.getBeginning(refno) < 0
  1065. || con.match.getEnd(refno) < 0)
  1066. return -1; // ********
  1067. int o2 = con.match.getBeginning(refno);
  1068. int literallen = con.match.getEnd(refno)-o2;
  1069. if (!isSet(opts, IGNORE_CASE)) {
  1070. if (dx > 0) {
  1071. if (!regionMatches(target, offset, con.limit, o2, literallen))
  1072. return -1;
  1073. offset += literallen;
  1074. } else {
  1075. if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
  1076. return -1;
  1077. offset -= literallen;
  1078. }
  1079. } else {
  1080. if (dx > 0) {
  1081. if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
  1082. return -1;
  1083. offset += literallen;
  1084. } else {
  1085. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  1086. o2, literallen))
  1087. return -1;
  1088. offset -= literallen;
  1089. }
  1090. }
  1091. }
  1092. op = op.next;
  1093. break;
  1094. case Op.STRING:
  1095. {
  1096. String literal = op.getString();
  1097. int literallen = literal.length();
  1098. if (!isSet(opts, IGNORE_CASE)) {
  1099. if (dx > 0) {
  1100. if (!regionMatches(target, offset, con.limit, literal, literallen))
  1101. return -1;
  1102. offset += literallen;
  1103. } else {
  1104. if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
  1105. return -1;
  1106. offset -= literallen;
  1107. }
  1108. } else {
  1109. if (dx > 0) {
  1110. if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
  1111. return -1;
  1112. offset += literallen;
  1113. } else {
  1114. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  1115. literal, literallen))
  1116. return -1;
  1117. offset -= literallen;
  1118. }
  1119. }
  1120. }
  1121. op = op.next;
  1122. break;
  1123. case Op.CLOSURE:
  1124. {
  1125. /*
  1126. * Saves current position to avoid
  1127. * zero-width repeats.
  1128. */
  1129. int id = op.getData();
  1130. if (id >= 0) {
  1131. int previousOffset = con.offsets[id];
  1132. if (previousOffset < 0 || previousOffset != offset) {
  1133. con.offsets[id] = offset;
  1134. } else {
  1135. con.offsets[id] = -1;
  1136. op = op.next;
  1137. break;
  1138. }
  1139. }
  1140. int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
  1141. if (id >= 0) con.offsets[id] = -1;
  1142. if (ret >= 0) return ret;
  1143. op = op.next;
  1144. }
  1145. break;
  1146. case Op.QUESTION:
  1147. {
  1148. int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
  1149. if (ret >= 0) return ret;
  1150. op = op.next;
  1151. }
  1152. break;
  1153. case Op.NONGREEDYCLOSURE:
  1154. case Op.NONGREEDYQUESTION:
  1155. {
  1156. int ret = this. matchCharArray (con, op.next, offset, dx, opts);
  1157. if (ret >= 0) return ret;
  1158. op = op.getChild();
  1159. }
  1160. break;
  1161. case Op.UNION:
  1162. for (int i = 0; i < op.size(); i ++) {
  1163. int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
  1164. if (DEBUG) {
  1165. System.err.println("UNION: "+i+", ret="+ret);
  1166. }
  1167. if (ret >= 0) return ret;
  1168. }
  1169. return -1;
  1170. case Op.CAPTURE:
  1171. int refno = op.getData();
  1172. if (con.match != null && refno > 0) {
  1173. int save = con.match.getBeginning(refno);
  1174. con.match.setBeginning(refno, offset);
  1175. int ret = this. matchCharArray (con, op.next, offset, dx, opts);
  1176. if (ret < 0) con.match.setBeginning(refno, save);
  1177. return ret;
  1178. } else if (con.match != null && refno < 0) {
  1179. int index = -refno;
  1180. int save = con.match.getEnd(index);
  1181. con.match.setEnd(index, offset);
  1182. int ret = this. matchCharArray (con, op.next, offset, dx, opts);
  1183. if (ret < 0) con.match.setEnd(index, save);
  1184. return ret;
  1185. }
  1186. op = op.next;
  1187. break;
  1188. case Op.LOOKAHEAD:
  1189. if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
  1190. op = op.next;
  1191. break;
  1192. case Op.NEGATIVELOOKAHEAD:
  1193. if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1;
  1194. op = op.next;
  1195. break;
  1196. case Op.LOOKBEHIND:
  1197. if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
  1198. op = op.next;
  1199. break;
  1200. case Op.NEGATIVELOOKBEHIND:
  1201. if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1;
  1202. op = op.next;
  1203. break;
  1204. case Op.INDEPENDENT:
  1205. {
  1206. int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
  1207. if (ret < 0) return ret;
  1208. offset = ret;
  1209. op = op.next;
  1210. }
  1211. break;
  1212. case Op.MODIFIER:
  1213. {
  1214. int localopts = opts;
  1215. localopts |= op.getData();
  1216. localopts &= ~op.getData2();
  1217. //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
  1218. int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
  1219. if (ret < 0) return ret;
  1220. offset = ret;
  1221. op = op.next;
  1222. }
  1223. break;
  1224. case Op.CONDITION:
  1225. {
  1226. Op.ConditionOp cop = (Op.ConditionOp)op;
  1227. boolean matchp = false;
  1228. if (cop.refNumber > 0) {
  1229. if (cop.refNumber >= this.nofparen)
  1230. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
  1231. matchp = con.match.getBeginning(cop.refNumber) >= 0
  1232. && con.match.getEnd(cop.refNumber) >= 0;
  1233. } else {
  1234. matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
  1235. }
  1236. if (matchp) {
  1237. op = cop.yes;
  1238. } else if (cop.no != null) {
  1239. op = cop.no;
  1240. } else {
  1241. op = cop.next;
  1242. }
  1243. }
  1244. break;
  1245. default:
  1246. throw new RuntimeException("Unknown operation type: "+op.type);
  1247. } // switch (op.type)
  1248. } // while
  1249. }
  1250. private static final int getPreviousWordType(char[] target, int begin, int end,
  1251. int offset, int opts) {
  1252. int ret = getWordType(target, begin, end, --offset, opts);
  1253. while (ret == WT_IGNORE)
  1254. ret = getWordType(target, begin, end, --offset, opts);
  1255. return ret;
  1256. }
  1257. private static final int getWordType(char[] target, int begin, int end,
  1258. int offset, int opts) {
  1259. if (offset < begin || offset >= end) return WT_OTHER;
  1260. return getWordType0( target [ offset ] , opts);
  1261. }
  1262. private static final boolean regionMatches(char[] target, int offset, int limit,
  1263. String part, int partlen) {
  1264. if (offset < 0) return false;
  1265. if (limit-offset < partlen)
  1266. return false;
  1267. int i = 0;
  1268. while (partlen-- > 0) {
  1269. if ( target [ offset++ ] != part.charAt(i++))
  1270. return false;
  1271. }
  1272. return true;
  1273. }
  1274. private static final boolean regionMatches(char[] target, int offset, int limit,
  1275. int offset2, int partlen) {
  1276. if (offset < 0) return false;
  1277. if (limit-offset < partlen)
  1278. return false;
  1279. int i = offset2;
  1280. while (partlen-- > 0) {
  1281. if ( target [ offset++ ] != target [ i++ ] )
  1282. return false;
  1283. }
  1284. return true;
  1285. }
  1286. /**
  1287. * @see java.lang.String#regionMatches
  1288. */
  1289. private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
  1290. String part, int partlen) {
  1291. if (offset < 0) return false;
  1292. if (limit-offset < partlen)
  1293. return false;
  1294. int i = 0;
  1295. while (partlen-- > 0) {
  1296. char ch1 = target [ offset++ ] ;
  1297. char ch2 = part.charAt(i++);
  1298. if (ch1 == ch2)
  1299. continue;
  1300. char uch1 = Character.toUpperCase(ch1);
  1301. char uch2 = Character.toUpperCase(ch2);
  1302. if (uch1 == uch2)
  1303. continue;
  1304. if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
  1305. return false;
  1306. }
  1307. return true;
  1308. }
  1309. private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit,
  1310. int offset2, int partlen) {
  1311. if (offset < 0) return false;
  1312. if (limit-offset < partlen)
  1313. return false;
  1314. int i = offset2;
  1315. while (partlen-- > 0) {
  1316. char ch1 = target [ offset++ ] ;
  1317. char ch2 = target [ i++ ] ;
  1318. if (ch1 == ch2)
  1319. continue;
  1320. char uch1 = Character.toUpperCase(ch1);
  1321. char uch2 = Character.toUpperCase(ch2);
  1322. if (uch1 == uch2)
  1323. continue;
  1324. if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
  1325. return false;
  1326. }
  1327. return true;
  1328. }
  1329. /**
  1330. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  1331. *
  1332. * @return true if the target is matched to this regular expression.
  1333. */
  1334. public boolean matches(String target) {
  1335. return this.matches(target, 0, target .length() , (Match)null);
  1336. }
  1337. /**
  1338. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  1339. * in specified range or not.
  1340. *
  1341. * @param start Start offset of the range.
  1342. * @param end End offset +1 of the range.
  1343. * @return true if the target is matched to this regular expression.
  1344. */
  1345. public boolean matches(String target, int start, int end) {
  1346. return this.matches(target, start, end, (Match)null);
  1347. }
  1348. /**
  1349. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  1350. *
  1351. * @param match A Match instance for storing matching result.
  1352. * @return Offset of the start position in <VAR>target</VAR> or -1 if not match.
  1353. */
  1354. public boolean matches(String target, Match match) {
  1355. return this.matches(target, 0, target .length() , match);
  1356. }
  1357. /**
  1358. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  1359. * in specified range or not.
  1360. *
  1361. * @param start Start offset of the range.
  1362. * @param end End offset +1 of the range.
  1363. * @param match A Match instance for storing matching result.
  1364. * @return Offset of the start position in <VAR>target</VAR> or -1 if not match.
  1365. */
  1366. public boolean matches(String target, int start, int end, Match match) {
  1367. synchronized (this) {
  1368. if (this.operations == null)
  1369. this.prepare();
  1370. if (this.context == null)
  1371. this.context = new Context();
  1372. }
  1373. Context con = null;
  1374. synchronized (this.context) {
  1375. con = this.context.inuse ? new Context() : this.context;
  1376. con.reset(target, start, end, this.numberOfClosures);
  1377. }
  1378. if (match != null) {
  1379. match.setNumberOfGroups(this.nofparen);
  1380. match.setSource(target);
  1381. } else if (this.hasBackReferences) {
  1382. match = new Match();
  1383. match.setNumberOfGroups(this.nofparen);
  1384. // Need not to call setSource() because
  1385. // a caller can not access this match instance.
  1386. }
  1387. con.match = match;
  1388. if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
  1389. if (DEBUG) {
  1390. System.err.println("target string="+target);
  1391. }
  1392. int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
  1393. if (DEBUG) {
  1394. System.err.println("matchEnd="+matchEnd);
  1395. System.err.println("con.limit="+con.limit);
  1396. }
  1397. if (matchEnd == con.limit) {
  1398. if (con.match != null) {
  1399. con.match.setBeginning(0, con.start);
  1400. con.match.setEnd(0, matchEnd);
  1401. }
  1402. con.inuse = false;
  1403. return true;
  1404. }
  1405. return false;
  1406. }
  1407. /*
  1408. * The pattern has only fixed string.
  1409. * The engine uses Boyer-Moore.
  1410. */
  1411. if (this.fixedStringOnly) {
  1412. //System.err.println("DEBUG: fixed-only: "+this.fixedString);
  1413. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  1414. if (o >= 0) {
  1415. if (con.match != null) {
  1416. con.match.setBeginning(0, o);
  1417. con.match.setEnd(0, o+this.fixedString.length());
  1418. }
  1419. con.inuse = false;
  1420. return true;
  1421. }
  1422. con.inuse = false;
  1423. return false;
  1424. }
  1425. /*
  1426. * The pattern contains a fixed string.
  1427. * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
  1428. * If not, it return with false.
  1429. */
  1430. if (this.fixedString != null) {
  1431. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  1432. if (o < 0) {
  1433. //System.err.println("Non-match in fixed-string search.");
  1434. con.inuse = false;
  1435. return false;
  1436. }
  1437. }
  1438. int limit = con.limit-this.minlength;
  1439. int matchStart;
  1440. int matchEnd = -1;
  1441. /*
  1442. * Checks whether the expression starts with ".*".
  1443. */
  1444. if (this.operations != null
  1445. && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  1446. if (isSet(this.options, SINGLE_LINE)) {
  1447. matchStart = con.start;
  1448. matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
  1449. } else {
  1450. boolean previousIsEOL = true;
  1451. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  1452. int ch = target .charAt( matchStart ) ;
  1453. if (isEOLChar(ch)) {
  1454. previousIsEOL = true;
  1455. } else {
  1456. if (previousIsEOL) {
  1457. if (0 <= (matchEnd = this. matchString (con, this.operations,
  1458. matchStart, 1, this.options)))
  1459. break;
  1460. }
  1461. previousIsEOL = false;
  1462. }
  1463. }
  1464. }
  1465. }
  1466. /*
  1467. * Optimization against the first character.
  1468. */
  1469. else if (this.firstChar != null) {
  1470. //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
  1471. RangeToken range = this.firstChar;
  1472. if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
  1473. range = this.firstChar.getCaseInsensitiveToken();
  1474. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  1475. int ch = target .charAt( matchStart ) ;
  1476. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
  1477. ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
  1478. if (!range.match(ch)) continue;
  1479. } else {
  1480. if (!range.match(ch)) {
  1481. char ch1 = Character.toUpperCase((char)ch);
  1482. if (!range.match(ch1))
  1483. if (!range.match(Character.toLowerCase(ch1)))
  1484. continue;
  1485. }
  1486. }
  1487. if (0 <= (matchEnd = this. matchString (con, this.operations,
  1488. matchStart, 1, this.options)))
  1489. break;
  1490. }
  1491. } else {
  1492. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  1493. int ch = target .charAt( matchStart ) ;
  1494. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
  1495. ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) );
  1496. if (!range.match(ch)) continue;
  1497. if (0 <= (matchEnd = this. matchString (con, this.operations,
  1498. matchStart, 1, this.options)))
  1499. break;
  1500. }
  1501. }
  1502. }
  1503. /*
  1504. * Straightforward matching.
  1505. */
  1506. else {
  1507. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  1508. if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
  1509. break;
  1510. }
  1511. }
  1512. if (matchEnd >= 0) {
  1513. if (con.match != null) {
  1514. con.match.setBeginning(0, matchStart);
  1515. con.match.setEnd(0, matchEnd);
  1516. }
  1517. con.inuse = false;
  1518. return true;
  1519. } else {
  1520. con.inuse = false;
  1521. return false;
  1522. }
  1523. }
  1524. /**
  1525. * @return -1 when not match; offset of the end of matched string when match.
  1526. */
  1527. private int matchString (Context con, Op op, int offset, int dx, int opts) {
  1528. String target = con.strTarget;
  1529. while (true) {
  1530. if (op == null)
  1531. return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
  1532. if (offset > con.limit || offset < con.start)
  1533. return -1;
  1534. switch (op.type) {
  1535. case Op.CHAR:
  1536. if (isSet(opts, IGNORE_CASE)) {
  1537. int ch = op.getData();
  1538. if (dx > 0) {
  1539. if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) ))
  1540. return -1;
  1541. offset ++;
  1542. } else {
  1543. int o1 = offset-1;
  1544. if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) ))
  1545. return -1;
  1546. offset = o1;
  1547. }
  1548. } else {
  1549. int ch = op.getData();
  1550. if (dx > 0) {
  1551. if (offset >= con.limit || ch != target .charAt( offset ) )
  1552. return -1;
  1553. offset ++;
  1554. } else {
  1555. int o1 = offset-1;
  1556. if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) )
  1557. return -1;
  1558. offset = o1;
  1559. }
  1560. }
  1561. op = op.next;
  1562. break;
  1563. case Op.DOT:
  1564. if (dx > 0) {
  1565. if (offset >= con.limit)
  1566. return -1;
  1567. int ch = target .charAt( offset ) ;
  1568. if (isSet(opts, SINGLE_LINE)) {
  1569. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  1570. offset ++;
  1571. } else {
  1572. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  1573. ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
  1574. if (isEOLChar(ch))
  1575. return -1;
  1576. }
  1577. offset ++;
  1578. } else {
  1579. int o1 = offset-1;
  1580. if (o1 >= con.limit || o1 < 0)
  1581. return -1;
  1582. int ch = target .charAt( o1 ) ;
  1583. if (isSet(opts, SINGLE_LINE)) {
  1584. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  1585. o1 --;
  1586. } else {
  1587. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  1588. ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
  1589. if (!isEOLChar(ch))
  1590. return -1;
  1591. }
  1592. offset = o1;
  1593. }
  1594. op = op.next;
  1595. break;
  1596. case Op.RANGE:
  1597. case Op.NRANGE:
  1598. if (dx > 0) {
  1599. if (offset >= con.limit)
  1600. return -1;
  1601. int ch = target .charAt( offset ) ;
  1602. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  1603. ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) );
  1604. RangeToken tok = op.getToken();
  1605. if (isSet(opts, IGNORE_CASE)) {
  1606. tok = tok.getCaseInsensitiveToken();
  1607. if (!tok.match(ch)) {
  1608. if (ch >= 0x10000) return -1;
  1609. char uch;
  1610. if (!tok.match(uch = Character.toUpperCase((char)ch))
  1611. && !tok.match(Character.toLowerCase(uch)))
  1612. return -1;
  1613. }
  1614. } else {
  1615. if (!tok.match(ch)) return -1;
  1616. }
  1617. offset ++;
  1618. } else {
  1619. int o1 = offset-1;
  1620. if (o1 >= con.limit || o1 < 0)
  1621. return -1;
  1622. int ch = target .charAt( o1 ) ;
  1623. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  1624. ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch);
  1625. RangeToken tok = op.getToken();
  1626. if (isSet(opts, IGNORE_CASE)) {
  1627. tok = tok.getCaseInsensitiveToken();
  1628. if (!tok.match(ch)) {
  1629. if (ch >= 0x10000) return -1;
  1630. char uch;
  1631. if (!tok.match(uch = Character.toUpperCase((char)ch))
  1632. && !tok.match(Character.toLowerCase(uch)))
  1633. return -1;
  1634. }
  1635. } else {
  1636. if (!tok.match(ch)) return -1;
  1637. }
  1638. offset = o1;
  1639. }
  1640. op = op.next;
  1641. break;
  1642. case Op.ANCHOR:
  1643. boolean go = false;
  1644. switch (op.getData()) {
  1645. case '^':
  1646. if (isSet(opts, MULTIPLE_LINES)) {
  1647. if (!(offset == con.start
  1648. || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
  1649. return -1;
  1650. } else {
  1651. if (offset != con.start)
  1652. return -1;
  1653. }
  1654. break;
  1655. case '@': // Internal use only.
  1656. // The @ always matches line beginnings.
  1657. if (!(offset == con.start
  1658. || offset > con.start && isEOLChar( target .charAt( offset-1 ) )))
  1659. return -1;
  1660. break;
  1661. case '$':
  1662. if (isSet(opts, MULTIPLE_LINES)) {
  1663. if (!(offset == con.limit
  1664. || offset < con.limit && isEOLChar( target .charAt( offset ) )))
  1665. return -1;
  1666. } else {
  1667. if (!(offset == con.limit
  1668. || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
  1669. || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
  1670. && target .charAt( offset+1 ) == LINE_FEED))
  1671. return -1;
  1672. }
  1673. break;
  1674. case 'A':
  1675. if (offset != con.start) return -1;
  1676. break;
  1677. case 'Z':
  1678. if (!(offset == con.limit
  1679. || offset+1 == con.limit && isEOLChar( target .charAt( offset ) )
  1680. || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN
  1681. && target .charAt( offset+1 ) == LINE_FEED))
  1682. return -1;
  1683. break;
  1684. case 'z':
  1685. if (offset != con.limit) return -1;
  1686. break;
  1687. case 'b':
  1688. if (con.length == 0) return -1;
  1689. {
  1690. int after = getWordType(target, con.start, con.limit, offset, opts);
  1691. if (after == WT_IGNORE) return -1;
  1692. int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
  1693. if (after == before) return -1;
  1694. }
  1695. break;
  1696. case 'B':
  1697. if (con.length == 0)
  1698. go = true;
  1699. else {
  1700. int after = getWordType(target, con.start, con.limit, offset, opts);
  1701. go = after == WT_IGNORE
  1702. || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
  1703. }
  1704. if (!go) return -1;
  1705. break;
  1706. case '<':
  1707. if (con.length == 0 || offset == con.limit) return -1;
  1708. if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
  1709. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
  1710. return -1;
  1711. break;
  1712. case '>':
  1713. if (con.length == 0 || offset == con.start) return -1;
  1714. if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
  1715. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
  1716. return -1;
  1717. break;
  1718. } // switch anchor type
  1719. op = op.next;
  1720. break;
  1721. case Op.BACKREFERENCE:
  1722. {
  1723. int refno = op.getData();
  1724. if (refno <= 0 || refno >= this.nofparen)
  1725. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
  1726. if (con.match.getBeginning(refno) < 0
  1727. || con.match.getEnd(refno) < 0)
  1728. return -1; // ********
  1729. int o2 = con.match.getBeginning(refno);
  1730. int literallen = con.match.getEnd(refno)-o2;
  1731. if (!isSet(opts, IGNORE_CASE)) {
  1732. if (dx > 0) {
  1733. if (!regionMatches(target, offset, con.limit, o2, literallen))
  1734. return -1;
  1735. offset += literallen;
  1736. } else {
  1737. if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
  1738. return -1;
  1739. offset -= literallen;
  1740. }
  1741. } else {
  1742. if (dx > 0) {
  1743. if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
  1744. return -1;
  1745. offset += literallen;
  1746. } else {
  1747. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  1748. o2, literallen))
  1749. return -1;
  1750. offset -= literallen;
  1751. }
  1752. }
  1753. }
  1754. op = op.next;
  1755. break;
  1756. case Op.STRING:
  1757. {
  1758. String literal = op.getString();
  1759. int literallen = literal.length();
  1760. if (!isSet(opts, IGNORE_CASE)) {
  1761. if (dx > 0) {
  1762. if (!regionMatches(target, offset, con.limit, literal, literallen))
  1763. return -1;
  1764. offset += literallen;
  1765. } else {
  1766. if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
  1767. return -1;
  1768. offset -= literallen;
  1769. }
  1770. } else {
  1771. if (dx > 0) {
  1772. if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
  1773. return -1;
  1774. offset += literallen;
  1775. } else {
  1776. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  1777. literal, literallen))
  1778. return -1;
  1779. offset -= literallen;
  1780. }
  1781. }
  1782. }
  1783. op = op.next;
  1784. break;
  1785. case Op.CLOSURE:
  1786. {
  1787. /*
  1788. * Saves current position to avoid
  1789. * zero-width repeats.
  1790. */
  1791. int id = op.getData();
  1792. if (id >= 0) {
  1793. int previousOffset = con.offsets[id];
  1794. if (previousOffset < 0 || previousOffset != offset) {
  1795. con.offsets[id] = offset;
  1796. } else {
  1797. con.offsets[id] = -1;
  1798. op = op.next;
  1799. break;
  1800. }
  1801. }
  1802. int ret = this. matchString (con, op.getChild(), offset, dx, opts);
  1803. if (id >= 0) con.offsets[id] = -1;
  1804. if (ret >= 0) return ret;
  1805. op = op.next;
  1806. }
  1807. break;
  1808. case Op.QUESTION:
  1809. {
  1810. int ret = this. matchString (con, op.getChild(), offset, dx, opts);
  1811. if (ret >= 0) return ret;
  1812. op = op.next;
  1813. }
  1814. break;
  1815. case Op.NONGREEDYCLOSURE:
  1816. case Op.NONGREEDYQUESTION:
  1817. {
  1818. int ret = this. matchString (con, op.next, offset, dx, opts);
  1819. if (ret >= 0) return ret;
  1820. op = op.getChild();
  1821. }
  1822. break;
  1823. case Op.UNION:
  1824. for (int i = 0; i < op.size(); i ++) {
  1825. int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
  1826. if (DEBUG) {
  1827. System.err.println("UNION: "+i+", ret="+ret);
  1828. }
  1829. if (ret >= 0) return ret;
  1830. }
  1831. return -1;
  1832. case Op.CAPTURE:
  1833. int refno = op.getData();
  1834. if (con.match != null && refno > 0) {
  1835. int save = con.match.getBeginning(refno);
  1836. con.match.setBeginning(refno, offset);
  1837. int ret = this. matchString (con, op.next, offset, dx, opts);
  1838. if (ret < 0) con.match.setBeginning(refno, save);
  1839. return ret;
  1840. } else if (con.match != null && refno < 0) {
  1841. int index = -refno;
  1842. int save = con.match.getEnd(index);
  1843. con.match.setEnd(index, offset);
  1844. int ret = this. matchString (con, op.next, offset, dx, opts);
  1845. if (ret < 0) con.match.setEnd(index, save);
  1846. return ret;
  1847. }
  1848. op = op.next;
  1849. break;
  1850. case Op.LOOKAHEAD:
  1851. if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
  1852. op = op.next;
  1853. break;
  1854. case Op.NEGATIVELOOKAHEAD:
  1855. if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1;
  1856. op = op.next;
  1857. break;
  1858. case Op.LOOKBEHIND:
  1859. if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
  1860. op = op.next;
  1861. break;
  1862. case Op.NEGATIVELOOKBEHIND:
  1863. if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1;
  1864. op = op.next;
  1865. break;
  1866. case Op.INDEPENDENT:
  1867. {
  1868. int ret = this. matchString (con, op.getChild(), offset, dx, opts);
  1869. if (ret < 0) return ret;
  1870. offset = ret;
  1871. op = op.next;
  1872. }
  1873. break;
  1874. case Op.MODIFIER:
  1875. {
  1876. int localopts = opts;
  1877. localopts |= op.getData();
  1878. localopts &= ~op.getData2();
  1879. //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
  1880. int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
  1881. if (ret < 0) return ret;
  1882. offset = ret;
  1883. op = op.next;
  1884. }
  1885. break;
  1886. case Op.CONDITION:
  1887. {
  1888. Op.ConditionOp cop = (Op.ConditionOp)op;
  1889. boolean matchp = false;
  1890. if (cop.refNumber > 0) {
  1891. if (cop.refNumber >= this.nofparen)
  1892. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
  1893. matchp = con.match.getBeginning(cop.refNumber) >= 0
  1894. && con.match.getEnd(cop.refNumber) >= 0;
  1895. } else {
  1896. matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
  1897. }
  1898. if (matchp) {
  1899. op = cop.yes;
  1900. } else if (cop.no != null) {
  1901. op = cop.no;
  1902. } else {
  1903. op = cop.next;
  1904. }
  1905. }
  1906. break;
  1907. default:
  1908. throw new RuntimeException("Unknown operation type: "+op.type);
  1909. } // switch (op.type)
  1910. } // while
  1911. }
  1912. private static final int getPreviousWordType(String target, int begin, int end,
  1913. int offset, int opts) {
  1914. int ret = getWordType(target, begin, end, --offset, opts);
  1915. while (ret == WT_IGNORE)
  1916. ret = getWordType(target, begin, end, --offset, opts);
  1917. return ret;
  1918. }
  1919. private static final int getWordType(String target, int begin, int end,
  1920. int offset, int opts) {
  1921. if (offset < begin || offset >= end) return WT_OTHER;
  1922. return getWordType0( target .charAt( offset ) , opts);
  1923. }
  1924. private static final boolean regionMatches(String text, int offset, int limit,
  1925. String part, int partlen) {
  1926. if (limit-offset < partlen) return false;
  1927. return text.regionMatches(offset, part, 0, partlen);
  1928. }
  1929. private static final boolean regionMatches(String text, int offset, int limit,
  1930. int offset2, int partlen) {
  1931. if (limit-offset < partlen) return false;
  1932. return text.regionMatches(offset, text, offset2, partlen);
  1933. }
  1934. private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
  1935. String part, int partlen) {
  1936. return text.regionMatches(true, offset, part, 0, partlen);
  1937. }
  1938. private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
  1939. int offset2, int partlen) {
  1940. if (limit-offset < partlen) return false;
  1941. return text.regionMatches(true, offset, text, offset2, partlen);
  1942. }
  1943. /**
  1944. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  1945. *
  1946. * @return true if the target is matched to this regular expression.
  1947. */
  1948. public boolean matches(CharacterIterator target) {
  1949. return this.matches(target, (Match)null);
  1950. }
  1951. /**
  1952. * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  1953. *
  1954. * @param match A Match instance for storing matching result.
  1955. * @return Offset of the start position in <VAR>target</VAR> or -1 if not match.
  1956. */
  1957. public boolean matches(CharacterIterator target, Match match) {
  1958. int start = target.getBeginIndex();
  1959. int end = target.getEndIndex();
  1960. synchronized (this) {
  1961. if (this.operations == null)
  1962. this.prepare();
  1963. if (this.context == null)
  1964. this.context = new Context();
  1965. }
  1966. Context con = null;
  1967. synchronized (this.context) {
  1968. con = this.context.inuse ? new Context() : this.context;
  1969. con.reset(target, start, end, this.numberOfClosures);
  1970. }
  1971. if (match != null) {
  1972. match.setNumberOfGroups(this.nofparen);
  1973. match.setSource(target);
  1974. } else if (this.hasBackReferences) {
  1975. match = new Match();
  1976. match.setNumberOfGroups(this.nofparen);
  1977. // Need not to call setSource() because
  1978. // a caller can not access this match instance.
  1979. }
  1980. con.match = match;
  1981. if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
  1982. int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
  1983. //System.err.println("DEBUG: matchEnd="+matchEnd);
  1984. if (matchEnd == con.limit) {
  1985. if (con.match != null) {
  1986. con.match.setBeginning(0, con.start);
  1987. con.match.setEnd(0, matchEnd);
  1988. }
  1989. con.inuse = false;
  1990. return true;
  1991. }
  1992. return false;
  1993. }
  1994. /*
  1995. * The pattern has only fixed string.
  1996. * The engine uses Boyer-Moore.
  1997. */
  1998. if (this.fixedStringOnly) {
  1999. //System.err.println("DEBUG: fixed-only: "+this.fixedString);
  2000. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  2001. if (o >= 0) {
  2002. if (con.match != null) {
  2003. con.match.setBeginning(0, o);
  2004. con.match.setEnd(0, o+this.fixedString.length());
  2005. }
  2006. con.inuse = false;
  2007. return true;
  2008. }
  2009. con.inuse = false;
  2010. return false;
  2011. }
  2012. /*
  2013. * The pattern contains a fixed string.
  2014. * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
  2015. * If not, it return with false.
  2016. */
  2017. if (this.fixedString != null) {
  2018. int o = this.fixedStringTable.matches(target, con.start, con.limit);
  2019. if (o < 0) {
  2020. //System.err.println("Non-match in fixed-string search.");
  2021. con.inuse = false;
  2022. return false;
  2023. }
  2024. }
  2025. int limit = con.limit-this.minlength;
  2026. int matchStart;
  2027. int matchEnd = -1;
  2028. /*
  2029. * Checks whether the expression starts with ".*".
  2030. */
  2031. if (this.operations != null
  2032. && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  2033. if (isSet(this.options, SINGLE_LINE)) {
  2034. matchStart = con.start;
  2035. matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
  2036. } else {
  2037. boolean previousIsEOL = true;
  2038. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  2039. int ch = target .setIndex( matchStart ) ;
  2040. if (isEOLChar(ch)) {
  2041. previousIsEOL = true;
  2042. } else {
  2043. if (previousIsEOL) {
  2044. if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
  2045. matchStart, 1, this.options)))
  2046. break;
  2047. }
  2048. previousIsEOL = false;
  2049. }
  2050. }
  2051. }
  2052. }
  2053. /*
  2054. * Optimization against the first character.
  2055. */
  2056. else if (this.firstChar != null) {
  2057. //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
  2058. RangeToken range = this.firstChar;
  2059. if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
  2060. range = this.firstChar.getCaseInsensitiveToken();
  2061. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  2062. int ch = target .setIndex( matchStart ) ;
  2063. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
  2064. ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
  2065. if (!range.match(ch)) continue;
  2066. } else {
  2067. if (!range.match(ch)) {
  2068. char ch1 = Character.toUpperCase((char)ch);
  2069. if (!range.match(ch1))
  2070. if (!range.match(Character.toLowerCase(ch1)))
  2071. continue;
  2072. }
  2073. }
  2074. if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
  2075. matchStart, 1, this.options)))
  2076. break;
  2077. }
  2078. } else {
  2079. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  2080. int ch = target .setIndex( matchStart ) ;
  2081. if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
  2082. ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) );
  2083. if (!range.match(ch)) continue;
  2084. if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
  2085. matchStart, 1, this.options)))
  2086. break;
  2087. }
  2088. }
  2089. }
  2090. /*
  2091. * Straightforward matching.
  2092. */
  2093. else {
  2094. for (matchStart = con.start; matchStart <= limit; matchStart ++) {
  2095. if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
  2096. break;
  2097. }
  2098. }
  2099. if (matchEnd >= 0) {
  2100. if (con.match != null) {
  2101. con.match.setBeginning(0, matchStart);
  2102. con.match.setEnd(0, matchEnd);
  2103. }
  2104. con.inuse = false;
  2105. return true;
  2106. } else {
  2107. con.inuse = false;
  2108. return false;
  2109. }
  2110. }
  2111. /**
  2112. * @return -1 when not match; offset of the end of matched string when match.
  2113. */
  2114. private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
  2115. CharacterIterator target = con.ciTarget;
  2116. while (true) {
  2117. if (op == null)
  2118. return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
  2119. if (offset > con.limit || offset < con.start)
  2120. return -1;
  2121. switch (op.type) {
  2122. case Op.CHAR:
  2123. if (isSet(opts, IGNORE_CASE)) {
  2124. int ch = op.getData();
  2125. if (dx > 0) {
  2126. if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) ))
  2127. return -1;
  2128. offset ++;
  2129. } else {
  2130. int o1 = offset-1;
  2131. if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) ))
  2132. return -1;
  2133. offset = o1;
  2134. }
  2135. } else {
  2136. int ch = op.getData();
  2137. if (dx > 0) {
  2138. if (offset >= con.limit || ch != target .setIndex( offset ) )
  2139. return -1;
  2140. offset ++;
  2141. } else {
  2142. int o1 = offset-1;
  2143. if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) )
  2144. return -1;
  2145. offset = o1;
  2146. }
  2147. }
  2148. op = op.next;
  2149. break;
  2150. case Op.DOT:
  2151. if (dx > 0) {
  2152. if (offset >= con.limit)
  2153. return -1;
  2154. int ch = target .setIndex( offset ) ;
  2155. if (isSet(opts, SINGLE_LINE)) {
  2156. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  2157. offset ++;
  2158. } else {
  2159. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  2160. ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
  2161. if (isEOLChar(ch))
  2162. return -1;
  2163. }
  2164. offset ++;
  2165. } else {
  2166. int o1 = offset-1;
  2167. if (o1 >= con.limit || o1 < 0)
  2168. return -1;
  2169. int ch = target .setIndex( o1 ) ;
  2170. if (isSet(opts, SINGLE_LINE)) {
  2171. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  2172. o1 --;
  2173. } else {
  2174. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  2175. ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
  2176. if (!isEOLChar(ch))
  2177. return -1;
  2178. }
  2179. offset = o1;
  2180. }
  2181. op = op.next;
  2182. break;
  2183. case Op.RANGE:
  2184. case Op.NRANGE:
  2185. if (dx > 0) {
  2186. if (offset >= con.limit)
  2187. return -1;
  2188. int ch = target .setIndex( offset ) ;
  2189. if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  2190. ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) );
  2191. RangeToken tok = op.getToken();
  2192. if (isSet(opts, IGNORE_CASE)) {
  2193. tok = tok.getCaseInsensitiveToken();
  2194. if (!tok.match(ch)) {
  2195. if (ch >= 0x10000) return -1;
  2196. char uch;
  2197. if (!tok.match(uch = Character.toUpperCase((char)ch))
  2198. && !tok.match(Character.toLowerCase(uch)))
  2199. return -1;
  2200. }
  2201. } else {
  2202. if (!tok.match(ch)) return -1;
  2203. }
  2204. offset ++;
  2205. } else {
  2206. int o1 = offset-1;
  2207. if (o1 >= con.limit || o1 < 0)
  2208. return -1;
  2209. int ch = target .setIndex( o1 ) ;
  2210. if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  2211. ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch);
  2212. RangeToken tok = op.getToken();
  2213. if (isSet(opts, IGNORE_CASE)) {
  2214. tok = tok.getCaseInsensitiveToken();
  2215. if (!tok.match(ch)) {
  2216. if (ch >= 0x10000) return -1;
  2217. char uch;
  2218. if (!tok.match(uch = Character.toUpperCase((char)ch))
  2219. && !tok.match(Character.toLowerCase(uch)))
  2220. return -1;
  2221. }
  2222. } else {
  2223. if (!tok.match(ch)) return -1;
  2224. }
  2225. offset = o1;
  2226. }
  2227. op = op.next;
  2228. break;
  2229. case Op.ANCHOR:
  2230. boolean go = false;
  2231. switch (op.getData()) {
  2232. case '^':
  2233. if (isSet(opts, MULTIPLE_LINES)) {
  2234. if (!(offset == con.start
  2235. || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
  2236. return -1;
  2237. } else {
  2238. if (offset != con.start)
  2239. return -1;
  2240. }
  2241. break;
  2242. case '@': // Internal use only.
  2243. // The @ always matches line beginnings.
  2244. if (!(offset == con.start
  2245. || offset > con.start && isEOLChar( target .setIndex( offset-1 ) )))
  2246. return -1;
  2247. break;
  2248. case '$':
  2249. if (isSet(opts, MULTIPLE_LINES)) {
  2250. if (!(offset == con.limit
  2251. || offset < con.limit && isEOLChar( target .setIndex( offset ) )))
  2252. return -1;
  2253. } else {
  2254. if (!(offset == con.limit
  2255. || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
  2256. || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
  2257. && target .setIndex( offset+1 ) == LINE_FEED))
  2258. return -1;
  2259. }
  2260. break;
  2261. case 'A':
  2262. if (offset != con.start) return -1;
  2263. break;
  2264. case 'Z':
  2265. if (!(offset == con.limit
  2266. || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) )
  2267. || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN
  2268. && target .setIndex( offset+1 ) == LINE_FEED))
  2269. return -1;
  2270. break;
  2271. case 'z':
  2272. if (offset != con.limit) return -1;
  2273. break;
  2274. case 'b':
  2275. if (con.length == 0) return -1;
  2276. {
  2277. int after = getWordType(target, con.start, con.limit, offset, opts);
  2278. if (after == WT_IGNORE) return -1;
  2279. int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
  2280. if (after == before) return -1;
  2281. }
  2282. break;
  2283. case 'B':
  2284. if (con.length == 0)
  2285. go = true;
  2286. else {
  2287. int after = getWordType(target, con.start, con.limit, offset, opts);
  2288. go = after == WT_IGNORE
  2289. || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
  2290. }
  2291. if (!go) return -1;
  2292. break;
  2293. case '<':
  2294. if (con.length == 0 || offset == con.limit) return -1;
  2295. if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
  2296. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
  2297. return -1;
  2298. break;
  2299. case '>':
  2300. if (con.length == 0 || offset == con.start) return -1;
  2301. if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
  2302. || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
  2303. return -1;
  2304. break;
  2305. } // switch anchor type
  2306. op = op.next;
  2307. break;
  2308. case Op.BACKREFERENCE:
  2309. {
  2310. int refno = op.getData();
  2311. if (refno <= 0 || refno >= this.nofparen)
  2312. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
  2313. if (con.match.getBeginning(refno) < 0
  2314. || con.match.getEnd(refno) < 0)
  2315. return -1; // ********
  2316. int o2 = con.match.getBeginning(refno);
  2317. int literallen = con.match.getEnd(refno)-o2;
  2318. if (!isSet(opts, IGNORE_CASE)) {
  2319. if (dx > 0) {
  2320. if (!regionMatches(target, offset, con.limit, o2, literallen))
  2321. return -1;
  2322. offset += literallen;
  2323. } else {
  2324. if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
  2325. return -1;
  2326. offset -= literallen;
  2327. }
  2328. } else {
  2329. if (dx > 0) {
  2330. if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
  2331. return -1;
  2332. offset += literallen;
  2333. } else {
  2334. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  2335. o2, literallen))
  2336. return -1;
  2337. offset -= literallen;
  2338. }
  2339. }
  2340. }
  2341. op = op.next;
  2342. break;
  2343. case Op.STRING:
  2344. {
  2345. String literal = op.getString();
  2346. int literallen = literal.length();
  2347. if (!isSet(opts, IGNORE_CASE)) {
  2348. if (dx > 0) {
  2349. if (!regionMatches(target, offset, con.limit, literal, literallen))
  2350. return -1;
  2351. offset += literallen;
  2352. } else {
  2353. if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
  2354. return -1;
  2355. offset -= literallen;
  2356. }
  2357. } else {
  2358. if (dx > 0) {
  2359. if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
  2360. return -1;
  2361. offset += literallen;
  2362. } else {
  2363. if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
  2364. literal, literallen))
  2365. return -1;
  2366. offset -= literallen;
  2367. }
  2368. }
  2369. }
  2370. op = op.next;
  2371. break;
  2372. case Op.CLOSURE:
  2373. {
  2374. /*
  2375. * Saves current position to avoid
  2376. * zero-width repeats.
  2377. */
  2378. int id = op.getData();
  2379. if (id >= 0) {
  2380. int previousOffset = con.offsets[id];
  2381. if (previousOffset < 0 || previousOffset != offset) {
  2382. con.offsets[id] = offset;
  2383. } else {
  2384. con.offsets[id] = -1;
  2385. op = op.next;
  2386. break;
  2387. }
  2388. }
  2389. int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
  2390. if (id >= 0) con.offsets[id] = -1;
  2391. if (ret >= 0) return ret;
  2392. op = op.next;
  2393. }
  2394. break;
  2395. case Op.QUESTION:
  2396. {
  2397. int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
  2398. if (ret >= 0) return ret;
  2399. op = op.next;
  2400. }
  2401. break;
  2402. case Op.NONGREEDYCLOSURE:
  2403. case Op.NONGREEDYQUESTION:
  2404. {
  2405. int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
  2406. if (ret >= 0) return ret;
  2407. op = op.getChild();
  2408. }
  2409. break;
  2410. case Op.UNION:
  2411. for (int i = 0; i < op.size(); i ++) {
  2412. int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
  2413. if (DEBUG) {
  2414. System.err.println("UNION: "+i+", ret="+ret);
  2415. }
  2416. if (ret >= 0) return ret;
  2417. }
  2418. return -1;
  2419. case Op.CAPTURE:
  2420. int refno = op.getData();
  2421. if (con.match != null && refno > 0) {
  2422. int save = con.match.getBeginning(refno);
  2423. con.match.setBeginning(refno, offset);
  2424. int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
  2425. if (ret < 0) con.match.setBeginning(refno, save);
  2426. return ret;
  2427. } else if (con.match != null && refno < 0) {
  2428. int index = -refno;
  2429. int save = con.match.getEnd(index);
  2430. con.match.setEnd(index, offset);
  2431. int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
  2432. if (ret < 0) con.match.setEnd(index, save);
  2433. return ret;
  2434. }
  2435. op = op.next;
  2436. break;
  2437. case Op.LOOKAHEAD:
  2438. if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
  2439. op = op.next;
  2440. break;
  2441. case Op.NEGATIVELOOKAHEAD:
  2442. if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1;
  2443. op = op.next;
  2444. break;
  2445. case Op.LOOKBEHIND:
  2446. if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
  2447. op = op.next;
  2448. break;
  2449. case Op.NEGATIVELOOKBEHIND:
  2450. if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1;
  2451. op = op.next;
  2452. break;
  2453. case Op.INDEPENDENT:
  2454. {
  2455. int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
  2456. if (ret < 0) return ret;
  2457. offset = ret;
  2458. op = op.next;
  2459. }
  2460. break;
  2461. case Op.MODIFIER:
  2462. {
  2463. int localopts = opts;
  2464. localopts |= op.getData();
  2465. localopts &= ~op.getData2();
  2466. //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
  2467. int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
  2468. if (ret < 0) return ret;
  2469. offset = ret;
  2470. op = op.next;
  2471. }
  2472. break;
  2473. case Op.CONDITION:
  2474. {
  2475. Op.ConditionOp cop = (Op.ConditionOp)op;
  2476. boolean matchp = false;
  2477. if (cop.refNumber > 0) {
  2478. if (cop.refNumber >= this.nofparen)
  2479. throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
  2480. matchp = con.match.getBeginning(cop.refNumber) >= 0
  2481. && con.match.getEnd(cop.refNumber) >= 0;
  2482. } else {
  2483. matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
  2484. }
  2485. if (matchp) {
  2486. op = cop.yes;
  2487. } else if (cop.no != null) {
  2488. op = cop.no;
  2489. } else {
  2490. op = cop.next;
  2491. }
  2492. }
  2493. break;
  2494. default:
  2495. throw new RuntimeException("Unknown operation type: "+op.type);
  2496. } // switch (op.type)
  2497. } // while
  2498. }
  2499. private static final int getPreviousWordType(CharacterIterator target, int begin, int end,
  2500. int offset, int opts) {
  2501. int ret = getWordType(target, begin, end, --offset, opts);
  2502. while (ret == WT_IGNORE)
  2503. ret = getWordType(target, begin, end, --offset, opts);
  2504. return ret;
  2505. }
  2506. private static final int getWordType(CharacterIterator target, int begin, int end,
  2507. int offset, int opts) {
  2508. if (offset < begin || offset >= end) return WT_OTHER;
  2509. return getWordType0( target .setIndex( offset ) , opts);
  2510. }
  2511. private static final boolean regionMatches(CharacterIterator target, int offset, int limit,
  2512. String part, int partlen) {
  2513. if (offset < 0) return false;
  2514. if (limit-offset < partlen)
  2515. return false;
  2516. int i = 0;
  2517. while (partlen-- > 0) {
  2518. if ( target .setIndex( offset++ ) != part.charAt(i++))
  2519. return false;
  2520. }
  2521. return true;
  2522. }
  2523. private static final boolean regionMatches(CharacterIterator target, int offset, int limit,
  2524. int offset2, int partlen) {
  2525. if (offset < 0) return false;
  2526. if (limit-offset < partlen)
  2527. return false;
  2528. int i = offset2;
  2529. while (partlen-- > 0) {
  2530. if ( target .setIndex( offset++ ) != target .setIndex( i++ ) )
  2531. return false;
  2532. }
  2533. return true;
  2534. }
  2535. /**
  2536. * @see java.lang.String#regionMatches
  2537. */
  2538. private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit,
  2539. String part, int partlen) {
  2540. if (offset < 0) return false;
  2541. if (limit-offset < partlen)
  2542. return false;
  2543. int i = 0;
  2544. while (partlen-- > 0) {
  2545. char ch1 = target .setIndex( offset++ ) ;
  2546. char ch2 = part.charAt(i++);
  2547. if (ch1 == ch2)
  2548. continue;
  2549. char uch1 = Character.toUpperCase(ch1);
  2550. char uch2 = Character.toUpperCase(ch2);
  2551. if (uch1 == uch2)
  2552. continue;
  2553. if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
  2554. return false;
  2555. }
  2556. return true;
  2557. }
  2558. private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit,
  2559. int offset2, int partlen) {
  2560. if (offset < 0) return false;
  2561. if (limit-offset < partlen)
  2562. return false;
  2563. int i = offset2;
  2564. while (partlen-- > 0) {
  2565. char ch1 = target .setIndex( offset++ ) ;
  2566. char ch2 = target .setIndex( i++ ) ;
  2567. if (ch1 == ch2)
  2568. continue;
  2569. char uch1 = Character.toUpperCase(ch1);
  2570. char uch2 = Character.toUpperCase(ch2);
  2571. if (uch1 == uch2)
  2572. continue;
  2573. if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
  2574. return false;
  2575. }
  2576. return true;
  2577. }
  // ================================================================
  // Instance state. The fields tagged @serial are the serialized form; the
  // transient fields are derived state that prepare() assigns.

  /**
   * A regular expression.
   * @serial
   */
  String regex;
  /**
   * Option flags of this expression, a bitwise OR of the option constants
   * declared in this class (tested with isSet()).
   * @serial
   */
  int options;
  /**
   * The number of parenthesis in the regular expression.
   * Copied from the parser's {@code parennumber} in setPattern().
   * @serial
   */
  int nofparen;
  /**
   * Internal representation of the regular expression.
   * Produced by the parser in setPattern().
   * @serial
   */
  Token tokentree;
  /** Copied from the parser's {@code hasBackReferences} flag in setPattern(). */
  boolean hasBackReferences = false;
  /** Assigned in prepare() from {@code tokentree.getMinLength()}. */
  transient int minlength;
  /**
   * Set to null by setPattern(). prepare() calls {@code compile(tokentree)} and
   * then inspects this field; presumably compile() populates it (not visible here).
   */
  transient Op operations = null;
  /**
   * NOTE(review): not assigned in this part of the file; presumably the value
   * passed to Context.reset() as {@code nofclosures} -- confirm against callers.
   */
  transient int numberOfClosures;
  /** Matching context; set to null by setPattern(). */
  transient Context context = null;
  /**
   * Range built by the first-character analysis in prepare(); null when that
   * analysis is disabled or does not yield {@code Token.FC_TERMINAL}.
   */
  transient RangeToken firstChar = null;
  /** Literal string found by prepare(); may stay unassigned when no usable literal exists. */
  transient String fixedString = null;
  /** Option flags that apply to {@link #fixedString}; assigned in prepare(). */
  transient int fixedStringOptions;
  /** Search table built in prepare() from {@link #fixedString}. */
  transient BMPattern fixedStringTable = null;
  /**
   * Set to true by prepare() when the compiled operations consist of a single
   * STRING or CHAR operation with no successor.
   */
  transient boolean fixedStringOnly = false;
  2608. static final class Context {
  2609. CharacterIterator ciTarget;
  2610. String strTarget;
  2611. char[] charTarget;
  2612. int start;
  2613. int limit;
  2614. int length;
  2615. Match match;
  2616. boolean inuse = false;
  2617. int[] offsets;
  2618. Context() {
  2619. }
  2620. private void resetCommon(int nofclosures) {
  2621. this.length = this.limit-this.start;
  2622. this.inuse = true;
  2623. this.match = null;
  2624. if (this.offsets == null || this.offsets.length != nofclosures)
  2625. this.offsets = new int[nofclosures];
  2626. for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1;
  2627. }
  2628. void reset(CharacterIterator target, int start, int limit, int nofclosures) {
  2629. this.ciTarget = target;
  2630. this.start = start;
  2631. this.limit = limit;
  2632. this.resetCommon(nofclosures);
  2633. }
  2634. void reset(String target, int start, int limit, int nofclosures) {
  2635. this.strTarget = target;
  2636. this.start = start;
  2637. this.limit = limit;
  2638. this.resetCommon(nofclosures);
  2639. }
  2640. void reset(char[] target, int start, int limit, int nofclosures) {
  2641. this.charTarget = target;
  2642. this.start = start;
  2643. this.limit = limit;
  2644. this.resetCommon(nofclosures);
  2645. }
  2646. }
  2647. /**
  2648. * Prepares for matching. This method is called just before starting matching.
  2649. */
  2650. void prepare() {
  2651. if (Op.COUNT) Op.nofinstances = 0;
  2652. this.compile(this.tokentree);
  2653. /*
  2654. if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
  2655. Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
  2656. anchor.next = this.operations;
  2657. this.operations = anchor;
  2658. }
  2659. */
  2660. if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
  2661. this.minlength = this.tokentree.getMinLength();
  2662. this.firstChar = null;
  2663. if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
  2664. && !isSet(this.options, XMLSCHEMA_MODE)) {
  2665. RangeToken firstChar = Token.createRange();
  2666. int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
  2667. if (fresult == Token.FC_TERMINAL) {
  2668. firstChar.compactRanges();
  2669. this.firstChar = firstChar;
  2670. if (DEBUG)
  2671. System.err.println("DEBUG: Use the first character optimization: "+firstChar);
  2672. }
  2673. }
  2674. if (this.operations != null
  2675. && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
  2676. && this.operations.next == null) {
  2677. if (DEBUG)
  2678. System.err.print(" *** Only fixed string! *** ");
  2679. this.fixedStringOnly = true;
  2680. if (this.operations.type == Op.STRING)
  2681. this.fixedString = this.operations.getString();
  2682. else if (this.operations.getData() >= 0x10000) { // Op.CHAR
  2683. this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
  2684. } else {
  2685. char[] ac = new char[1];
  2686. ac[0] = (char)this.operations.getData();
  2687. this.fixedString = new String(ac);
  2688. }
  2689. this.fixedStringOptions = this.options;
  2690. this.fixedStringTable = new BMPattern(this.fixedString, 256,
  2691. isSet(this.fixedStringOptions, IGNORE_CASE));
  2692. } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
  2693. && !isSet(this.options, XMLSCHEMA_MODE)) {
  2694. Token.FixedStringContainer container = new Token.FixedStringContainer();
  2695. this.tokentree.findFixedString(container, this.options);
  2696. this.fixedString = container.token == null ? null : container.token.getString();
  2697. this.fixedStringOptions = container.options;
  2698. if (this.fixedString != null && this.fixedString.length() < 2)
  2699. this.fixedString = null;
  2700. // This pattern has a fixed string of which length is more than one.
  2701. if (this.fixedString != null) {
  2702. this.fixedStringTable = new BMPattern(this.fixedString, 256,
  2703. isSet(this.fixedStringOptions, IGNORE_CASE));
  2704. if (DEBUG) {
  2705. System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
  2706. +"/" //+this.fixedString
  2707. +"/"+REUtil.createOptionString(this.fixedStringOptions));
  2708. System.err.print("String: ");
  2709. REUtil.dumpString(this.fixedString);
  2710. }
  2711. }
  2712. }
  2713. }
  2714. /**
  2715. * An option.
  2716. * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
   * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
  2718. * does not capture.
  2719. *
  2720. * @see #RegularExpression(java.lang.String,int)
  2721. * @see #setPattern(java.lang.String,int)
  2722. static final int MARK_PARENS = 1<<0;
  2723. */
  /**
   * Option letter "i".
   * prepare() passes this flag to the fixed-string search table.
   */
  static final int IGNORE_CASE = 1<<1;
  /**
   * Option letter "s".
   */
  static final int SINGLE_LINE = 1<<2;
  /**
   * Option letter "m".
   */
  static final int MULTIPLE_LINES = 1<<3;
  /**
   * Option letter "x".
   */
  static final int EXTENDED_COMMENT = 1<<4;
  /**
   * Option letter "u".
   * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
   *
   * @see #RegularExpression(java.lang.String,int)
   * @see #setPattern(java.lang.String,int)
   * @see #UNICODE_WORD_BOUNDARY
   */
  static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
  /**
   * An option (letter "w").
   * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>.
   * <p>By default, the engine considers a position between a word character
   * (<span class="REGEX"><kbd>\w</kbd></span>) and a non word character
   * is a word boundary.
   * <p>By this option, the engine checks word boundaries with the method of
   * 'Unicode Regular Expression Guidelines' Revision 4.
   *
   * @see #RegularExpression(java.lang.String,int)
   * @see #setPattern(java.lang.String,int)
   */
  static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
  /**
   * Option letter "H".
   * When set, prepare() skips the first-character analysis.
   */
  static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
  /**
   * Option letter "F".
   * When set, prepare() does not search the pattern for an embedded fixed string.
   */
  static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
  /**
   * Option letter "X". XML Schema mode.
   * setPattern() selects ParserForXMLSchema when this flag is set, and prepare()
   * skips both the first-character and the embedded fixed-string optimizations.
   */
  static final int XMLSCHEMA_MODE = 1<<9;
  /**
   * Option letter ",".
   */
  static final int SPECIAL_COMMA = 1<<10;
  2777. private static final boolean isSet(int options, int flag) {
  2778. return (options & flag) == flag;
  2779. }
  /**
   * Creates a new RegularExpression instance without an option string.
   *
   * @param regex A regular expression
   * @throws ParseException <VAR>regex</VAR> is not conforming to the syntax.
   */
  public RegularExpression(String regex) throws ParseException {
    // The cast selects the (String, String) overload explicitly.
    setPattern(regex, (String) null);
  }
  /**
   * Creates a new RegularExpression instance with options.
   *
   * @param regex   A regular expression
   * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
   * @throws ParseException <VAR>regex</VAR> is not conforming to the syntax.
   */
  public RegularExpression(String regex, String options) throws ParseException {
    setPattern(regex, options);
  }
  /**
   * Creates an instance from an already parsed token tree; no parsing is done.
   *
   * @param regex             the pattern text
   * @param tok               the parsed form of {@code regex}
   * @param parens            value stored as the number of groups
   * @param hasBackReferences value stored as the back-reference flag
   * @param options           option flags
   */
  RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.nofparen = parens;
    this.hasBackReferences = hasBackReferences;
  }
  2806. /**
  2807. *
  2808. */
  2809. public void setPattern(String newPattern) throws ParseException {
  2810. this.setPattern(newPattern, this.options);
  2811. }
  2812. private void setPattern(String newPattern, int options) throws ParseException {
  2813. this.regex = newPattern;
  2814. this.options = options;
  2815. RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
  2816. ? new ParserForXMLSchema() : new RegexParser();
  2817. this.tokentree = rp.parse(this.regex, this.options);
  2818. this.nofparen = rp.parennumber;
  2819. this.hasBackReferences = rp.hasBackReferences;
  2820. this.operations = null;
  2821. this.context = null;
  2822. }
  2823. /**
  2824. *
  2825. */
  2826. public void setPattern(String newPattern, String options) throws ParseException {
  2827. this.setPattern(newPattern, REUtil.parseOptions(options));
  2828. }
  2829. /**
  2830. *
  2831. */
  2832. public String getPattern() {
  2833. return this.regex;
  2834. }
  2835. /**
  2836. * Represents this instence in String.
  2837. */
  2838. public String toString() {
  2839. return this.tokentree.toString(this.options);
  2840. }
  2841. /**
  2842. * Returns a option string.
  2843. * The order of letters in it may be different from a string specified
  2844. * in a constructor or <code>setPattern()</code>.
  2845. *
  2846. * @see #RegularExpression(java.lang.String,java.lang.String)
  2847. * @see #setPattern(java.lang.String,java.lang.String)
  2848. */
  2849. public String getOptions() {
  2850. return REUtil.createOptionString(this.options);
  2851. }
  2852. /**
  2853. * Return true if patterns are the same and the options are equivalent.
  2854. */
  2855. public boolean equals(Object obj) {
  2856. if (obj == null) return false;
  2857. if (!(obj instanceof RegularExpression))
  2858. return false;
  2859. RegularExpression r = (RegularExpression)obj;
  2860. return this.regex.equals(r.regex) && this.options == r.options;
  2861. }
  2862. boolean equals(String pattern, int options) {
  2863. return this.regex.equals(pattern) && this.options == options;
  2864. }
  2865. /**
  2866. *
  2867. */
  2868. public int hashCode() {
  2869. return (this.regex+"/"+this.getOptions()).hashCode();
  2870. }
  2871. /**
  2872. * Return the number of regular expression groups.
  2873. * This method returns 1 when the regular expression has no capturing-parenthesis.
  2874. *
  2875. */
  2876. public int getNumberOfGroups() {
  2877. return this.nofparen;
  2878. }
  2879. // ================================================================
  2880. private static final int WT_IGNORE = 0;
  2881. private static final int WT_LETTER = 1;
  2882. private static final int WT_OTHER = 2;
  2883. private static final int getWordType0(char ch, int opts) {
  2884. if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
  2885. if (isSet(opts, USE_UNICODE_CATEGORY)) {
  2886. return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
  2887. }
  2888. return isWordChar(ch) ? WT_LETTER : WT_OTHER;
  2889. }
  2890. switch (Character.getType(ch)) {
  2891. case Character.UPPERCASE_LETTER: // L
  2892. case Character.LOWERCASE_LETTER: // L
  2893. case Character.TITLECASE_LETTER: // L
  2894. case Character.MODIFIER_LETTER: // L
  2895. case Character.OTHER_LETTER: // L
  2896. case Character.LETTER_NUMBER: // N
  2897. case Character.DECIMAL_DIGIT_NUMBER: // N
  2898. case Character.OTHER_NUMBER: // N
  2899. case Character.COMBINING_SPACING_MARK: // Mc
  2900. return WT_LETTER;
  2901. case Character.FORMAT: // Cf
  2902. case Character.NON_SPACING_MARK: // Mn
  2903. case Character.ENCLOSING_MARK: // Mc
  2904. return WT_IGNORE;
  2905. case Character.CONTROL: // Cc
  2906. switch (ch) {
  2907. case '\t':
  2908. case '\n':
  2909. case '\u000B':
  2910. case '\f':
  2911. case '\r':
  2912. return WT_OTHER;
  2913. default:
  2914. return WT_IGNORE;
  2915. }
  2916. default:
  2917. return WT_OTHER;
  2918. }
  2919. }
  2920. // ================================================================
  2921. static final int LINE_FEED = 0x000A;
  2922. static final int CARRIAGE_RETURN = 0x000D;
  2923. static final int LINE_SEPARATOR = 0x2028;
  2924. static final int PARAGRAPH_SEPARATOR = 0x2029;
  2925. private static final boolean isEOLChar(int ch) {
  2926. return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
  2927. || ch == PARAGRAPH_SEPARATOR;
  2928. }
  2929. private static final boolean isWordChar(int ch) { // Legacy word characters
  2930. if (ch == '_') return true;
  2931. if (ch < '0') return false;
  2932. if (ch > 'z') return false;
  2933. if (ch <= '9') return true;
  2934. if (ch < 'A') return false;
  2935. if (ch <= 'Z') return true;
  2936. if (ch < 'a') return false;
  2937. return true;
  2938. }
  2939. private static final boolean matchIgnoreCase(int chardata, int ch) {
  2940. if (chardata == ch) return true;
  2941. if (chardata > 0xffff || ch > 0xffff) return false;
  2942. char uch1 = Character.toUpperCase((char)chardata);
  2943. char uch2 = Character.toUpperCase((char)ch);
  2944. if (uch1 == uch2) return true;
  2945. return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
  2946. }
  2947. }