1. package com.sun.org.apache.regexp.internal;
  2. /*
  3. * ====================================================================
  4. *
  5. * The Apache Software License, Version 1.1
  6. *
  7. * Copyright (c) 1999 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution, if
  23. * any, must include the following acknowlegement:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowlegement may appear in the software itself,
  27. * if and wherever such third-party acknowlegements normally appear.
  28. *
  29. * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
  30. * Foundation" must not be used to endorse or promote products derived
  31. * from this software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache"
  35. * nor may "Apache" appear in their names without prior written
  36. * permission of the Apache Group.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. *
  57. */
  58. import com.sun.org.apache.regexp.internal.RE;
  59. import java.util.Hashtable;
  60. /**
  61. * A regular expression compiler class. This class compiles a pattern string into a
  62. * regular expression program interpretable by the RE evaluator class. The 'recompile'
  63. * command line tool uses this compiler to pre-compile regular expressions for use
  64. * with RE. For a description of the syntax accepted by RECompiler and what you can
  65. * do with regular expressions, see the documentation for the RE matcher class.
  66. *
  67. * @see RE
  68. * @see recompile
  69. *
  70. * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
  71. * @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $
  72. */
  73. public class RECompiler
  74. {
  75. // The compiled program
  76. char[] instruction; // The compiled RE 'program' instruction buffer
  77. int lenInstruction; // The amount of the program buffer currently in use
  78. // Input state for compiling regular expression
  79. String pattern; // Input string
  80. int len; // Length of the pattern string
  81. int idx; // Current input index into ac
  82. int parens; // Total number of paren pairs
  83. // Node flags
  84. static final int NODE_NORMAL = 0; // No flags (nothing special)
  85. static final int NODE_NULLABLE = 1; // True if node is potentially null
  86. static final int NODE_TOPLEVEL = 2; // True if top level expr
  87. // Special types of 'escapes'
  88. static final char ESC_MASK = 0xfff0; // Escape complexity mask
  89. static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
  90. static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
  91. static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
  92. // {m,n} stacks
  93. static final int maxBrackets = 10; // Maximum number of bracket pairs
  94. static int brackets = 0; // Number of bracket sets
  95. static int[] bracketStart = null; // Starting point
  96. static int[] bracketEnd = null; // Ending point
  97. static int[] bracketMin = null; // Minimum number of matches
  98. static int[] bracketOpt = null; // Additional optional matches
  99. static final int bracketUnbounded = -1; // Unbounded value
  100. static final int bracketFinished = -2; // Unbounded value
  101. // Lookup table for POSIX character class names
  102. static Hashtable hashPOSIX = new Hashtable();
  103. static
  104. {
  105. hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
  106. hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
  107. hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
  108. hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
  109. hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
  110. hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
  111. hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
  112. hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
  113. hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
  114. hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
  115. hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
  116. hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
  117. hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
  118. hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
  119. }
  120. /**
  121. * Constructor. Creates (initially empty) storage for a regular expression program.
  122. */
  123. public RECompiler()
  124. {
  125. // Start off with a generous, yet reasonable, initial size
  126. instruction = new char[128];
  127. lenInstruction = 0;
  128. }
  129. /**
  130. * Ensures that n more characters can fit in the program buffer.
  131. * If n more can't fit, then the size is doubled until it can.
  132. * @param n Number of additional characters to ensure will fit.
  133. */
  134. void ensure(int n)
  135. {
  136. // Get current program length
  137. int curlen = instruction.length;
  138. // If the current length + n more is too much
  139. if (lenInstruction + n >= curlen)
  140. {
  141. // Double the size of the program array until n more will fit
  142. while (lenInstruction + n >= curlen)
  143. {
  144. curlen *= 2;
  145. }
  146. // Allocate new program array and move data into it
  147. char[] newInstruction = new char[curlen];
  148. System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
  149. instruction = newInstruction;
  150. }
  151. }
  152. /**
  153. * Emit a single character into the program stream.
  154. * @param c Character to add
  155. */
  156. void emit(char c)
  157. {
  158. // Make room for character
  159. ensure(1);
  160. // Add character
  161. instruction[lenInstruction++] = c;
  162. }
  163. /**
  164. * Inserts a node with a given opcode and opdata at insertAt. The node relative next
  165. * pointer is initialized to 0.
  166. * @param opcode Opcode for new node
  167. * @param opdata Opdata for new node (only the low 16 bits are currently used)
  168. * @param insertAt Index at which to insert the new node in the program
  169. */
  170. void nodeInsert(char opcode, int opdata, int insertAt)
  171. {
  172. // Make room for a new node
  173. ensure(RE.nodeSize);
  174. // Move everything from insertAt to the end down nodeSize elements
  175. System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
  176. instruction[insertAt + RE.offsetOpcode] = opcode;
  177. instruction[insertAt + RE.offsetOpdata] = (char)opdata;
  178. instruction[insertAt + RE.offsetNext] = 0;
  179. lenInstruction += RE.nodeSize;
  180. }
  181. /**
  182. * Appends a node to the end of a node chain
  183. * @param node Start of node chain to traverse
  184. * @param pointTo Node to have the tail of the chain point to
  185. */
  186. void setNextOfEnd(int node, int pointTo)
  187. {
  188. // Traverse the chain until the next offset is 0
  189. int next;
  190. while ((next = instruction[node + RE.offsetNext]) != 0)
  191. {
  192. node += next;
  193. }
  194. // Point the last node in the chain to pointTo.
  195. instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
  196. }
  197. /**
  198. * Adds a new node
  199. * @param opcode Opcode for node
  200. * @param opdata Opdata for node (only the low 16 bits are currently used)
  201. * @return Index of new node in program
  202. */
  203. int node(char opcode, int opdata)
  204. {
  205. // Make room for a new node
  206. ensure(RE.nodeSize);
  207. // Add new node at end
  208. instruction[lenInstruction + RE.offsetOpcode] = opcode;
  209. instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
  210. instruction[lenInstruction + RE.offsetNext] = 0;
  211. lenInstruction += RE.nodeSize;
  212. // Return index of new node
  213. return lenInstruction - RE.nodeSize;
  214. }
  215. /**
  216. * Throws a new internal error exception
  217. * @exception Error Thrown in the event of an internal error.
  218. */
  219. void internalError() throws Error
  220. {
  221. throw new Error("Internal error!");
  222. }
  223. /**
  224. * Throws a new syntax error exception
  225. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  226. */
  227. void syntaxError(String s) throws RESyntaxException
  228. {
  229. throw new RESyntaxException(s);
  230. }
  231. /**
  232. * Allocate storage for brackets only as needed
  233. */
  234. void allocBrackets()
  235. {
  236. // Allocate bracket stacks if not already done
  237. if (bracketStart == null)
  238. {
  239. // Allocate storage
  240. bracketStart = new int[maxBrackets];
  241. bracketEnd = new int[maxBrackets];
  242. bracketMin = new int[maxBrackets];
  243. bracketOpt = new int[maxBrackets];
  244. // Initialize to invalid values
  245. for (int i = 0; i < maxBrackets; i++)
  246. {
  247. bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
  248. }
  249. }
  250. }
  251. /**
  252. * Match bracket {m,n} expression put results in bracket member variables
  253. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  254. */
  255. void bracket() throws RESyntaxException
  256. {
  257. // Current character must be a '{'
  258. if (idx >= len || pattern.charAt(idx++) != '{')
  259. {
  260. internalError();
  261. }
  262. // Next char must be a digit
  263. if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
  264. {
  265. syntaxError("Expected digit");
  266. }
  267. // Get min ('m' of {m,n}) number
  268. StringBuffer number = new StringBuffer();
  269. while (idx < len && Character.isDigit(pattern.charAt(idx)))
  270. {
  271. number.append(pattern.charAt(idx++));
  272. }
  273. try
  274. {
  275. bracketMin[brackets] = Integer.parseInt(number.toString());
  276. }
  277. catch (NumberFormatException e)
  278. {
  279. syntaxError("Expected valid number");
  280. }
  281. // If out of input, fail
  282. if (idx >= len)
  283. {
  284. syntaxError("Expected comma or right bracket");
  285. }
  286. // If end of expr, optional limit is 0
  287. if (pattern.charAt(idx) == '}')
  288. {
  289. idx++;
  290. bracketOpt[brackets] = 0;
  291. return;
  292. }
  293. // Must have at least {m,} and maybe {m,n}.
  294. if (idx >= len || pattern.charAt(idx++) != ',')
  295. {
  296. syntaxError("Expected comma");
  297. }
  298. // If out of input, fail
  299. if (idx >= len)
  300. {
  301. syntaxError("Expected comma or right bracket");
  302. }
  303. // If {m,} max is unlimited
  304. if (pattern.charAt(idx) == '}')
  305. {
  306. idx++;
  307. bracketOpt[brackets] = bracketUnbounded;
  308. return;
  309. }
  310. // Next char must be a digit
  311. if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
  312. {
  313. syntaxError("Expected digit");
  314. }
  315. // Get max number
  316. number.setLength(0);
  317. while (idx < len && Character.isDigit(pattern.charAt(idx)))
  318. {
  319. number.append(pattern.charAt(idx++));
  320. }
  321. try
  322. {
  323. bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
  324. }
  325. catch (NumberFormatException e)
  326. {
  327. syntaxError("Expected valid number");
  328. }
  329. // Optional repetitions must be > 0
  330. if (bracketOpt[brackets] <= 0)
  331. {
  332. syntaxError("Bad range");
  333. }
  334. // Must have close brace
  335. if (idx >= len || pattern.charAt(idx++) != '}')
  336. {
  337. syntaxError("Missing close brace");
  338. }
  339. }
  340. /**
  341. * Match an escape sequence. Handles quoted chars and octal escapes as well
  342. * as normal escape characters. Always advances the input stream by the
  343. * right amount. This code "understands" the subtle difference between an
  344. * octal escape and a backref. You can access the type of ESC_CLASS or
  345. * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
  346. * @return ESC_* code or character if simple escape
  347. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  348. */
  349. char escape() throws RESyntaxException
  350. {
  351. // "Shouldn't" happen
  352. if (pattern.charAt(idx) != '\\')
  353. {
  354. internalError();
  355. }
  356. // Escape shouldn't occur as last character in string!
  357. if (idx + 1 == len)
  358. {
  359. syntaxError("Escape terminates string");
  360. }
  361. // Switch on character after backslash
  362. idx += 2;
  363. char escapeChar = pattern.charAt(idx - 1);
  364. switch (escapeChar)
  365. {
  366. case RE.E_BOUND:
  367. case RE.E_NBOUND:
  368. return ESC_COMPLEX;
  369. case RE.E_ALNUM:
  370. case RE.E_NALNUM:
  371. case RE.E_SPACE:
  372. case RE.E_NSPACE:
  373. case RE.E_DIGIT:
  374. case RE.E_NDIGIT:
  375. return ESC_CLASS;
  376. case 'u':
  377. case 'x':
  378. {
  379. // Exact required hex digits for escape type
  380. int hexDigits = (escapeChar == 'u' ? 4 : 2);
  381. // Parse up to hexDigits characters from input
  382. int val = 0;
  383. for ( ; idx < len && hexDigits-- > 0; idx++)
  384. {
  385. // Get char
  386. char c = pattern.charAt(idx);
  387. // If it's a hexadecimal digit (0-9)
  388. if (c >= '0' && c <= '9')
  389. {
  390. // Compute new value
  391. val = (val << 4) + c - '0';
  392. }
  393. else
  394. {
  395. // If it's a hexadecimal letter (a-f)
  396. c = Character.toLowerCase(c);
  397. if (c >= 'a' && c <= 'f')
  398. {
  399. // Compute new value
  400. val = (val << 4) + (c - 'a') + 10;
  401. }
  402. else
  403. {
  404. // If it's not a valid digit or hex letter, the escape must be invalid
  405. // because hexDigits of input have not been absorbed yet.
  406. syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
  407. }
  408. }
  409. }
  410. return (char)val;
  411. }
  412. case 't':
  413. return '\t';
  414. case 'n':
  415. return '\n';
  416. case 'r':
  417. return '\r';
  418. case 'f':
  419. return '\f';
  420. case '0':
  421. case '1':
  422. case '2':
  423. case '3':
  424. case '4':
  425. case '5':
  426. case '6':
  427. case '7':
  428. case '8':
  429. case '9':
  430. // An octal escape starts with a 0 or has two digits in a row
  431. if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
  432. {
  433. // Handle \nnn octal escapes
  434. int val = escapeChar - '0';
  435. if (idx < len && Character.isDigit(pattern.charAt(idx)))
  436. {
  437. val = ((val << 3) + (pattern.charAt(idx++) - '0'));
  438. if (idx < len && Character.isDigit(pattern.charAt(idx)))
  439. {
  440. val = ((val << 3) + (pattern.charAt(idx++) - '0'));
  441. }
  442. }
  443. return (char)val;
  444. }
  445. // It's actually a backreference (\[1-9]), not an escape
  446. return ESC_BACKREF;
  447. default:
  448. // Simple quoting of a character
  449. return escapeChar;
  450. }
  451. }
  452. /**
  453. * Compile a character class
  454. * @return Index of class node
  455. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  456. */
  457. int characterClass() throws RESyntaxException
  458. {
  459. // Check for bad calling or empty class
  460. if (pattern.charAt(idx) != '[')
  461. {
  462. internalError();
  463. }
  464. // Check for unterminated or empty class
  465. if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
  466. {
  467. syntaxError("Empty or unterminated class");
  468. }
  469. // Check for POSIX character class
  470. if (idx < len && pattern.charAt(idx) == ':')
  471. {
  472. // Skip colon
  473. idx++;
  474. // POSIX character classes are denoted with lowercase ASCII strings
  475. int idxStart = idx;
  476. while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
  477. {
  478. idx++;
  479. }
  480. // Should be a ":]" to terminate the POSIX character class
  481. if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
  482. {
  483. // Get character class
  484. String charClass = pattern.substring(idxStart, idx);
  485. // Select the POSIX class id
  486. Character i = (Character)hashPOSIX.get(charClass);
  487. if (i != null)
  488. {
  489. // Move past colon and right bracket
  490. idx += 2;
  491. // Return new POSIX character class node
  492. return node(RE.OP_POSIXCLASS, i.charValue());
  493. }
  494. syntaxError("Invalid POSIX character class '" + charClass + "'");
  495. }
  496. syntaxError("Invalid POSIX character class syntax");
  497. }
  498. // Try to build a class. Create OP_ANYOF node
  499. int ret = node(RE.OP_ANYOF, 0);
  500. // Parse class declaration
  501. char CHAR_INVALID = Character.MAX_VALUE;
  502. char last = CHAR_INVALID;
  503. char simpleChar = 0;
  504. boolean include = true;
  505. boolean definingRange = false;
  506. int idxFirst = idx;
  507. char rangeStart = Character.MIN_VALUE;
  508. char rangeEnd;
  509. RERange range = new RERange();
  510. while (idx < len && pattern.charAt(idx) != ']')
  511. {
  512. switchOnCharacter:
  513. // Switch on character
  514. switch (pattern.charAt(idx))
  515. {
  516. case '^':
  517. include = !include;
  518. if (idx == idxFirst)
  519. {
  520. range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
  521. }
  522. idx++;
  523. continue;
  524. case '\\':
  525. {
  526. // Escape always advances the stream
  527. char c;
  528. switch (c = escape ())
  529. {
  530. case ESC_COMPLEX:
  531. case ESC_BACKREF:
  532. // Word boundaries and backrefs not allowed in a character class!
  533. syntaxError("Bad character class");
  534. case ESC_CLASS:
  535. // Classes can't be an endpoint of a range
  536. if (definingRange)
  537. {
  538. syntaxError("Bad character class");
  539. }
  540. // Handle specific type of class (some are ok)
  541. switch (pattern.charAt(idx - 1))
  542. {
  543. case RE.E_NSPACE:
  544. case RE.E_NDIGIT:
  545. case RE.E_NALNUM:
  546. syntaxError("Bad character class");
  547. case RE.E_SPACE:
  548. range.include('\t', include);
  549. range.include('\r', include);
  550. range.include('\f', include);
  551. range.include('\n', include);
  552. range.include('\b', include);
  553. range.include(' ', include);
  554. break;
  555. case RE.E_ALNUM:
  556. range.include('a', 'z', include);
  557. range.include('A', 'Z', include);
  558. range.include('_', include);
  559. // Fall through!
  560. case RE.E_DIGIT:
  561. range.include('0', '9', include);
  562. break;
  563. }
  564. // Make last char invalid (can't be a range start)
  565. last = CHAR_INVALID;
  566. break;
  567. default:
  568. // Escape is simple so treat as a simple char
  569. simpleChar = c;
  570. break switchOnCharacter;
  571. }
  572. }
  573. continue;
  574. case '-':
  575. // Start a range if one isn't already started
  576. if (definingRange)
  577. {
  578. syntaxError("Bad class range");
  579. }
  580. definingRange = true;
  581. // If no last character, start of range is 0
  582. rangeStart = (last == CHAR_INVALID ? 0 : last);
  583. // Premature end of range. define up to Character.MAX_VALUE
  584. if ((idx + 1) < len && pattern.charAt(++idx) == ']')
  585. {
  586. simpleChar = Character.MAX_VALUE;
  587. break;
  588. }
  589. continue;
  590. default:
  591. simpleChar = pattern.charAt(idx++);
  592. break;
  593. }
  594. // Handle simple character simpleChar
  595. if (definingRange)
  596. {
  597. // if we are defining a range make it now
  598. rangeEnd = simpleChar;
  599. // Actually create a range if the range is ok
  600. if (rangeStart >= rangeEnd)
  601. {
  602. syntaxError("Bad character class");
  603. }
  604. range.include(rangeStart, rangeEnd, include);
  605. // We are done defining the range
  606. last = CHAR_INVALID;
  607. definingRange = false;
  608. }
  609. else
  610. {
  611. // If simple character and not start of range, include it
  612. if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
  613. {
  614. range.include(simpleChar, include);
  615. }
  616. last = simpleChar;
  617. }
  618. }
  619. // Shouldn't be out of input
  620. if (idx == len)
  621. {
  622. syntaxError("Unterminated character class");
  623. }
  624. // Absorb the ']' end of class marker
  625. idx++;
  626. // Emit character class definition
  627. instruction[ret + RE.offsetOpdata] = (char)range.num;
  628. for (int i = 0; i < range.num; i++)
  629. {
  630. emit((char)range.minRange[i]);
  631. emit((char)range.maxRange[i]);
  632. }
  633. return ret;
  634. }
  635. /**
  636. * Absorb an atomic character string. This method is a little tricky because
  637. * it can un-include the last character of string if a closure operator follows.
  638. * This is correct because *+? have higher precedence than concatentation (thus
  639. * ABC* means AB(C*) and NOT (ABC)*).
  640. * @return Index of new atom node
  641. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  642. */
  643. int atom() throws RESyntaxException
  644. {
  645. // Create a string node
  646. int ret = node(RE.OP_ATOM, 0);
  647. // Length of atom
  648. int lenAtom = 0;
  649. // Loop while we've got input
  650. atomLoop:
  651. while (idx < len)
  652. {
  653. // Is there a next char?
  654. if ((idx + 1) < len)
  655. {
  656. char c = pattern.charAt(idx + 1);
  657. // If the next 'char' is an escape, look past the whole escape
  658. if (pattern.charAt(idx) == '\\')
  659. {
  660. int idxEscape = idx;
  661. escape();
  662. if (idx < len)
  663. {
  664. c = pattern.charAt(idx);
  665. }
  666. idx = idxEscape;
  667. }
  668. // Switch on next char
  669. switch (c)
  670. {
  671. case '{':
  672. case '?':
  673. case '*':
  674. case '+':
  675. // If the next character is a closure operator and our atom is non-empty, the
  676. // current character should bind to the closure operator rather than the atom
  677. if (lenAtom != 0)
  678. {
  679. break atomLoop;
  680. }
  681. }
  682. }
  683. // Switch on current char
  684. switch (pattern.charAt(idx))
  685. {
  686. case ']':
  687. case '^':
  688. case '$':
  689. case '.':
  690. case '[':
  691. case '(':
  692. case ')':
  693. case '|':
  694. break atomLoop;
  695. case '{':
  696. case '?':
  697. case '*':
  698. case '+':
  699. // We should have an atom by now
  700. if (lenAtom == 0)
  701. {
  702. // No atom before closure
  703. syntaxError("Missing operand to closure");
  704. }
  705. break atomLoop;
  706. case '\\':
  707. {
  708. // Get the escaped character (advances input automatically)
  709. int idxBeforeEscape = idx;
  710. char c = escape();
  711. // Check if it's a simple escape (as opposed to, say, a backreference)
  712. if ((c & ESC_MASK) == ESC_MASK)
  713. {
  714. // Not a simple escape, so backup to where we were before the escape.
  715. idx = idxBeforeEscape;
  716. break atomLoop;
  717. }
  718. // Add escaped char to atom
  719. emit(c);
  720. lenAtom++;
  721. }
  722. break;
  723. default:
  724. // Add normal character to atom
  725. emit(pattern.charAt(idx++));
  726. lenAtom++;
  727. break;
  728. }
  729. }
  730. // This "shouldn't" happen
  731. if (lenAtom == 0)
  732. {
  733. internalError();
  734. }
  735. // Emit the atom length into the program
  736. instruction[ret + RE.offsetOpdata] = (char)lenAtom;
  737. return ret;
  738. }
  739. /**
  740. * Match a terminal node.
  741. * @param flags Flags
  742. * @return Index of terminal node (closeable)
  743. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  744. */
  745. int terminal(int[] flags) throws RESyntaxException
  746. {
  747. switch (pattern.charAt(idx))
  748. {
  749. case RE.OP_EOL:
  750. case RE.OP_BOL:
  751. case RE.OP_ANY:
  752. return node(pattern.charAt(idx++), 0);
  753. case '[':
  754. return characterClass();
  755. case '(':
  756. return expr(flags);
  757. case ')':
  758. syntaxError("Unexpected close paren");
  759. case '|':
  760. internalError();
  761. case ']':
  762. syntaxError("Mismatched class");
  763. case 0:
  764. syntaxError("Unexpected end of input");
  765. case '?':
  766. case '+':
  767. case '{':
  768. case '*':
  769. syntaxError("Missing operand to closure");
  770. case '\\':
  771. {
  772. // Don't forget, escape() advances the input stream!
  773. int idxBeforeEscape = idx;
  774. // Switch on escaped character
  775. switch (escape())
  776. {
  777. case ESC_CLASS:
  778. case ESC_COMPLEX:
  779. flags[0] &= ~NODE_NULLABLE;
  780. return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
  781. case ESC_BACKREF:
  782. {
  783. char backreference = (char)(pattern.charAt(idx - 1) - '0');
  784. if (parens <= backreference)
  785. {
  786. syntaxError("Bad backreference");
  787. }
  788. flags[0] |= NODE_NULLABLE;
  789. return node(RE.OP_BACKREF, backreference);
  790. }
  791. default:
  792. // We had a simple escape and we want to have it end up in
  793. // an atom, so we back up and fall though to the default handling
  794. idx = idxBeforeEscape;
  795. flags[0] &= ~NODE_NULLABLE;
  796. break;
  797. }
  798. }
  799. }
  800. // Everything above either fails or returns.
  801. // If it wasn't one of the above, it must be the start of an atom.
  802. flags[0] &= ~NODE_NULLABLE;
  803. return atom();
  804. }
  805. /**
  806. * Compile a possibly closured terminal
  807. * @param flags Flags passed by reference
  808. * @return Index of closured node
  809. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  810. */
  811. int closure(int[] flags) throws RESyntaxException
  812. {
  813. // Before terminal
  814. int idxBeforeTerminal = idx;
  815. // Values to pass by reference to terminal()
  816. int[] terminalFlags = { NODE_NORMAL };
  817. // Get terminal symbol
  818. int ret = terminal(terminalFlags);
  819. // Or in flags from terminal symbol
  820. flags[0] |= terminalFlags[0];
  821. // Advance input, set NODE_NULLABLE flag and do sanity checks
  822. if (idx >= len)
  823. {
  824. return ret;
  825. }
  826. boolean greedy = true;
  827. char closureType = pattern.charAt(idx);
  828. switch (closureType)
  829. {
  830. case '?':
  831. case '*':
  832. // The current node can be null
  833. flags[0] |= NODE_NULLABLE;
  834. case '+':
  835. // Eat closure character
  836. idx++;
  837. case '{':
  838. // Don't allow blantant stupidity
  839. int opcode = instruction[ret + RE.offsetOpcode];
  840. if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
  841. {
  842. syntaxError("Bad closure operand");
  843. }
  844. if ((terminalFlags[0] & NODE_NULLABLE) != 0)
  845. {
  846. syntaxError("Closure operand can't be nullable");
  847. }
  848. break;
  849. }
  850. // If the next character is a '?', make the closure non-greedy (reluctant)
  851. if (idx < len && pattern.charAt(idx) == '?')
  852. {
  853. idx++;
  854. greedy = false;
  855. }
  856. if (greedy)
  857. {
  858. // Actually do the closure now
  859. switch (closureType)
  860. {
  861. case '{':
  862. {
  863. // We look for our bracket in the list
  864. boolean found = false;
  865. int i;
  866. allocBrackets();
  867. for (i = 0; i < brackets; i++)
  868. {
  869. if (bracketStart[i] == idx)
  870. {
  871. found = true;
  872. break;
  873. }
  874. }
  875. // If its not in the list we parse the {m,n}
  876. if (!found)
  877. {
  878. if (brackets >= maxBrackets)
  879. {
  880. syntaxError("Too many bracketed closures (limit is 10)");
  881. }
  882. bracketStart[brackets] = idx;
  883. bracket();
  884. bracketEnd[brackets] = idx;
  885. i = brackets++;
  886. }
  887. // If there's a min, rewind stream and reparse
  888. if (--bracketMin[i] > 0)
  889. {
  890. // Rewind stream and run it through again
  891. idx = idxBeforeTerminal;
  892. break;
  893. }
  894. // Do the right thing for maximum ({m,})
  895. if (bracketOpt[i] == bracketFinished)
  896. {
  897. // Drop through now and closure expression.
  898. // We are done with the {m,} expr, so skip rest
  899. closureType = '*';
  900. bracketOpt[i] = 0;
  901. idx = bracketEnd[i];
  902. }
  903. else
  904. if (bracketOpt[i] == bracketUnbounded)
  905. {
  906. idx = idxBeforeTerminal;
  907. bracketOpt[i] = bracketFinished;
  908. break;
  909. }
  910. else
  911. if (bracketOpt[i]-- > 0)
  912. {
  913. // Drop through to optionally close and then 'play it again sam!'
  914. idx = idxBeforeTerminal;
  915. closureType = '?';
  916. }
  917. else
  918. {
  919. // We are done. skip the rest of {m,n} expr
  920. idx = bracketEnd[i];
  921. break;
  922. }
  923. }
  924. // Fall through!
  925. case '?':
  926. case '*':
  927. if (!greedy)
  928. {
  929. break;
  930. }
  931. if (closureType == '?')
  932. {
  933. // X? is compiled as (X|)
  934. nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
  935. setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
  936. int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
  937. setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
  938. setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
  939. }
  940. if (closureType == '*')
  941. {
  942. // X* is compiled as (X{gotoX}|)
  943. nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
  944. setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
  945. setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
  946. setNextOfEnd(ret + RE.nodeSize, ret); // the start again
  947. setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
  948. setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
  949. }
  950. break;
  951. case '+':
  952. {
  953. // X+ is compiled as X({gotoX}|)
  954. int branch;
  955. branch = node(RE.OP_BRANCH, 0); // a new branch
  956. setNextOfEnd(ret, branch); // is added to the end of X
  957. setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
  958. setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
  959. setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
  960. }
  961. break;
  962. }
  963. }
  964. else
  965. {
  966. // Add end after closured subexpr
  967. setNextOfEnd(ret, node(RE.OP_END, 0));
  968. // Actually do the closure now
  969. switch (closureType)
  970. {
  971. case '?':
  972. nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
  973. break;
  974. case '*':
  975. nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
  976. break;
  977. case '+':
  978. nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
  979. break;
  980. }
  981. // Point to the expr after the closure
  982. setNextOfEnd(ret, lenInstruction);
  983. }
  984. return ret;
  985. }
  986. /**
  987. * Compile one branch of an or operator (implements concatenation)
  988. * @param flags Flags passed by reference
  989. * @return Pointer to branch node
  990. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  991. */
  992. int branch(int[] flags) throws RESyntaxException
  993. {
  994. // Get each possibly closured piece and concat
  995. int node;
  996. int ret = node(RE.OP_BRANCH, 0);
  997. int chain = -1;
  998. int[] closureFlags = new int[1];
  999. boolean nullable = true;
  1000. while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
  1001. {
  1002. // Get new node
  1003. closureFlags[0] = NODE_NORMAL;
  1004. node = closure(closureFlags);
  1005. if (closureFlags[0] == NODE_NORMAL)
  1006. {
  1007. nullable = false;
  1008. }
  1009. // If there's a chain, append to the end
  1010. if (chain != -1)
  1011. {
  1012. setNextOfEnd(chain, node);
  1013. }
  1014. // Chain starts at current
  1015. chain = node;
  1016. }
  1017. // If we don't run loop, make a nothing node
  1018. if (chain == -1)
  1019. {
  1020. node(RE.OP_NOTHING, 0);
  1021. }
  1022. // Set nullable flag for this branch
  1023. if (nullable)
  1024. {
  1025. flags[0] |= NODE_NULLABLE;
  1026. }
  1027. return ret;
  1028. }
  1029. /**
  1030. * Compile an expression with possible parens around it. Paren matching
  1031. * is done at this level so we can tie the branch tails together.
  1032. * @param flags Flag value passed by reference
  1033. * @return Node index of expression in instruction array
  1034. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  1035. */
  1036. int expr(int[] flags) throws RESyntaxException
  1037. {
  1038. // Create open paren node unless we were called from the top level (which has no parens)
  1039. boolean paren = false;
  1040. int ret = -1;
  1041. int closeParens = parens;
  1042. if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
  1043. {
  1044. idx++;
  1045. paren = true;
  1046. ret = node(RE.OP_OPEN, parens++);
  1047. }
  1048. flags[0] &= ~NODE_TOPLEVEL;
  1049. // Create a branch node
  1050. int branch = branch(flags);
  1051. if (ret == -1)
  1052. {
  1053. ret = branch;
  1054. }
  1055. else
  1056. {
  1057. setNextOfEnd(ret, branch);
  1058. }
  1059. // Loop through branches
  1060. while (idx < len && pattern.charAt(idx) == '|')
  1061. {
  1062. idx++;
  1063. branch = branch(flags);
  1064. setNextOfEnd(ret, branch);
  1065. }
  1066. // Create an ending node (either a close paren or an OP_END)
  1067. int end;
  1068. if (paren)
  1069. {
  1070. if (idx < len && pattern.charAt(idx) == ')')
  1071. {
  1072. idx++;
  1073. }
  1074. else
  1075. {
  1076. syntaxError("Missing close paren");
  1077. }
  1078. end = node(RE.OP_CLOSE, closeParens);
  1079. }
  1080. else
  1081. {
  1082. end = node(RE.OP_END, 0);
  1083. }
  1084. // Append the ending node to the ret nodelist
  1085. setNextOfEnd(ret, end);
  1086. // Hook the ends of each branch to the end node
  1087. for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
  1088. {
  1089. // If branch, make the end of the branch's operand chain point to the end node.
  1090. if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
  1091. {
  1092. setNextOfEnd(i + RE.nodeSize, end);
  1093. }
  1094. }
  1095. // Return the node list
  1096. return ret;
  1097. }
  1098. /**
  1099. * Compiles a regular expression pattern into a program runnable by the pattern
  1100. * matcher class 'RE'.
  1101. * @param pattern Regular expression pattern to compile (see RECompiler class
  1102. * for details).
  1103. * @return A compiled regular expression program.
  1104. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  1105. * @see RECompiler
  1106. * @see RE
  1107. */
  1108. public REProgram compile(String pattern) throws RESyntaxException
  1109. {
  1110. // Initialize variables for compilation
  1111. this.pattern = pattern; // Save pattern in instance variable
  1112. len = pattern.length(); // Precompute pattern length for speed
  1113. idx = 0; // Set parsing index to the first character
  1114. lenInstruction = 0; // Set emitted instruction count to zero
  1115. parens = 1; // Set paren level to 1 (the implicit outer parens)
  1116. brackets = 0; // No bracketed closures yet
  1117. // Initialize pass by reference flags value
  1118. int[] flags = { NODE_TOPLEVEL };
  1119. // Parse expression
  1120. expr(flags);
  1121. // Should be at end of input
  1122. if (idx != len)
  1123. {
  1124. if (pattern.charAt(idx) == ')')
  1125. {
  1126. syntaxError("Unmatched close paren");
  1127. }
  1128. syntaxError("Unexpected input remains");
  1129. }
  1130. // Return the result
  1131. char[] ins = new char[lenInstruction];
  1132. System.arraycopy(instruction, 0, ins, 0, lenInstruction);
  1133. return new REProgram(ins);
  1134. }
  1135. /**
  1136. * Local, nested class for maintaining character ranges for character classes.
  1137. */
  1138. class RERange
  1139. {
  1140. int size = 16; // Capacity of current range arrays
  1141. int[] minRange = new int[size]; // Range minima
  1142. int[] maxRange = new int[size]; // Range maxima
  1143. int num = 0; // Number of range array elements in use
  1144. /**
  1145. * Deletes the range at a given index from the range lists
  1146. * @param index Index of range to delete from minRange and maxRange arrays.
  1147. */
  1148. void delete(int index)
  1149. {
  1150. // Return if no elements left or index is out of range
  1151. if (num == 0 || index >= num)
  1152. {
  1153. return;
  1154. }
  1155. // Move elements down
  1156. while (index++ < num)
  1157. {
  1158. if (index - 1 >= 0)
  1159. {
  1160. minRange[index-1] = minRange[index];
  1161. maxRange[index-1] = maxRange[index];
  1162. }
  1163. }
  1164. // One less element now
  1165. num--;
  1166. }
  1167. /**
  1168. * Merges a range into the range list, coalescing ranges if possible.
  1169. * @param min Minimum end of range
  1170. * @param max Maximum end of range
  1171. */
  1172. void merge(int min, int max)
  1173. {
  1174. // Loop through ranges
  1175. for (int i = 0; i < num; i++)
  1176. {
  1177. // Min-max is subsumed by minRange[i]-maxRange[i]
  1178. if (min >= minRange[i] && max <= maxRange[i])
  1179. {
  1180. return;
  1181. }
  1182. // Min-max subsumes minRange[i]-maxRange[i]
  1183. else if (min <= minRange[i] && max >= maxRange[i])
  1184. {
  1185. delete(i);
  1186. merge(min, max);
  1187. return;
  1188. }
  1189. // Min is in the range, but max is outside
  1190. else if (min >= minRange[i] && min <= maxRange[i])
  1191. {
  1192. delete(i);
  1193. min = minRange[i];
  1194. merge(min, max);
  1195. return;
  1196. }
  1197. // Max is in the range, but min is outside
  1198. else if (max >= minRange[i] && max <= maxRange[i])
  1199. {
  1200. delete(i);
  1201. max = maxRange[i];
  1202. merge(min, max);
  1203. return;
  1204. }
  1205. }
  1206. // Must not overlap any other ranges
  1207. if (num >= size)
  1208. {
  1209. size *= 2;
  1210. int[] newMin = new int[size];
  1211. int[] newMax = new int[size];
  1212. System.arraycopy(minRange, 0, newMin, 0, num);
  1213. System.arraycopy(maxRange, 0, newMax, 0, num);
  1214. minRange = newMin;
  1215. maxRange = newMax;
  1216. }
  1217. minRange[num] = min;
  1218. maxRange[num] = max;
  1219. num++;
  1220. }
  1221. /**
  1222. * Removes a range by deleting or shrinking all other ranges
  1223. * @param min Minimum end of range
  1224. * @param max Maximum end of range
  1225. */
  1226. void remove(int min, int max)
  1227. {
  1228. // Loop through ranges
  1229. for (int i = 0; i < num; i++)
  1230. {
  1231. // minRange[i]-maxRange[i] is subsumed by min-max
  1232. if (minRange[i] >= min && maxRange[i] <= max)
  1233. {
  1234. delete(i);
  1235. i--;
  1236. return;
  1237. }
  1238. // min-max is subsumed by minRange[i]-maxRange[i]
  1239. else if (min >= minRange[i] && max <= maxRange[i])
  1240. {
  1241. int minr = minRange[i];
  1242. int maxr = maxRange[i];
  1243. delete(i);
  1244. if (minr < min - 1)
  1245. {
  1246. merge(minr, min - 1);
  1247. }
  1248. if (max + 1 < maxr)
  1249. {
  1250. merge(max + 1, maxr);
  1251. }
  1252. return;
  1253. }
  1254. // minRange is in the range, but maxRange is outside
  1255. else if (minRange[i] >= min && minRange[i] <= max)
  1256. {
  1257. minRange[i] = max + 1;
  1258. return;
  1259. }
  1260. // maxRange is in the range, but minRange is outside
  1261. else if (maxRange[i] >= min && maxRange[i] <= max)
  1262. {
  1263. maxRange[i] = min - 1;
  1264. return;
  1265. }
  1266. }
  1267. }
  1268. /**
  1269. * Includes (or excludes) the range from min to max, inclusive.
  1270. * @param min Minimum end of range
  1271. * @param max Maximum end of range
  1272. * @param include True if range should be included. False otherwise.
  1273. */
  1274. void include(int min, int max, boolean include)
  1275. {
  1276. if (include)
  1277. {
  1278. merge(min, max);
  1279. }
  1280. else
  1281. {
  1282. remove(min, max);
  1283. }
  1284. }
  1285. /**
  1286. * Includes a range with the same min and max
  1287. * @param minmax Minimum and maximum end of range (inclusive)
  1288. * @param include True if range should be included. False otherwise.
  1289. */
  1290. void include(char minmax, boolean include)
  1291. {
  1292. include(minmax, minmax, include);
  1293. }
  1294. }
  1295. }