1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.dtd.models;
  58. import com.sun.org.apache.xerces.internal.xni.QName;
  59. import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec;
  60. /**
  61. * @version $Id: DFAContentModel.java,v 1.4 2002/05/29 17:59:37 neilg Exp $
  62. *
  63. * DFAContentModel is the derivative of ContentModel that does
  64. * all of the non-trivial element content validation. This class does
  65. * the conversion from the regular expression to the DFA that
  66. * it then uses in its validation algorithm.
  67. * <p>
  68. * <b>Note:</b> Upstream work insures that this class will never see
  69. * a content model with PCDATA in it. Any model with PCDATA is 'mixed'
  70. * and is handled via the MixedContentModel class since mixed models
  71. * are very constrained in form and easily handled via a special case.
  72. * This also makes implementation of this class much easier.
  73. *
  74. */
  75. public class DFAContentModel
  76. implements ContentModelValidator {
  77. //
  78. // Constants
  79. //
  80. // special strings
  81. /** Epsilon string. */
  82. private static String fEpsilonString = "<<CMNODE_EPSILON>>";
  83. /** End-of-content string. */
  84. private static String fEOCString = "<<CMNODE_EOC>>";
  85. /** initializing static members **/
  86. static {
  87. fEpsilonString = fEpsilonString.intern();
  88. fEOCString = fEOCString.intern();
  89. }
  90. // debugging
  91. /** Set to true to debug content model validation. */
  92. private static final boolean DEBUG_VALIDATE_CONTENT = false;
  93. //
  94. // Data
  95. //
  96. /* this is the EquivClassComparator object */
  97. //private EquivClassComparator comparator = null;
  98. /**
  99. * This is the map of unique input symbol elements to indices into
  100. * each state's per-input symbol transition table entry. This is part
  101. * of the built DFA information that must be kept around to do the
  102. * actual validation.
  103. */
  104. private QName fElemMap[] = null;
  105. /**
  106. * This is a map of whether the element map contains information
  107. * related to ANY models.
  108. */
  109. private int fElemMapType[] = null;
  110. /** The element map size. */
  111. private int fElemMapSize = 0;
  112. /** Boolean to distinguish Schema Mixed Content */
  113. private boolean fMixed;
  114. /**
  115. * The NFA position of the special EOC (end of content) node. This
  116. * is saved away since it's used during the DFA build.
  117. */
  118. private int fEOCPos = 0;
  119. /**
  120. * This is an array of booleans, one per state (there are
  121. * fTransTableSize states in the DFA) that indicates whether that
  122. * state is a final state.
  123. */
  124. private boolean fFinalStateFlags[] = null;
  125. /**
  126. * The list of follow positions for each NFA position (i.e. for each
  127. * non-epsilon leaf node.) This is only used during the building of
  128. * the DFA, and is let go afterwards.
  129. */
  130. private CMStateSet fFollowList[] = null;
  131. /**
  132. * This is the head node of our intermediate representation. It is
  133. * only non-null during the building of the DFA (just so that it
  134. * does not have to be passed all around.) Once the DFA is built,
  135. * this is no longer required so its nulled out.
  136. */
  137. private CMNode fHeadNode = null;
  138. /**
  139. * The count of leaf nodes. This is an important number that set some
  140. * limits on the sizes of data structures in the DFA process.
  141. */
  142. private int fLeafCount = 0;
  143. /**
  144. * An array of non-epsilon leaf nodes, which is used during the DFA
  145. * build operation, then dropped.
  146. */
  147. private CMLeaf fLeafList[] = null;
  148. /** Array mapping ANY types to the leaf list. */
  149. private int fLeafListType[] = null;
  150. //private ContentLeafNameTypeVector fLeafNameTypeVector = null;
  151. /**
  152. * The string pool of our parser session. This is set during construction
  153. * and kept around.
  154. */
  155. //private StringPool fStringPool = null;
  156. /**
  157. * This is the transition table that is the main by product of all
  158. * of the effort here. It is an array of arrays of ints. The first
  159. * dimension is the number of states we end up with in the DFA. The
  160. * second dimensions is the number of unique elements in the content
  161. * model (fElemMapSize). Each entry in the second dimension indicates
  162. * the new state given that input for the first dimension's start
  163. * state.
  164. * <p>
  165. * The fElemMap array handles mapping from element indexes to
  166. * positions in the second dimension of the transition table.
  167. */
  168. private int fTransTable[][] = null;
  169. /**
  170. * The number of valid entries in the transition table, and in the other
  171. * related tables such as fFinalStateFlags.
  172. */
  173. private int fTransTableSize = 0;
  174. /**
  175. * Flag that indicates that even though we have a "complicated"
  176. * content model, it is valid to have no content. In other words,
  177. * all parts of the content model are optional. For example:
  178. * <pre>
  179. * <!ELEMENT AllOptional (Optional*,NotRequired?)>
  180. * </pre>
  181. */
  182. private boolean fEmptyContentIsValid = false;
  183. // temp variables
  184. /** Temporary qualified name. */
  185. private QName fQName = new QName();
  186. //
  187. // Constructors
  188. //
  189. //
  190. // Constructors
  191. //
  192. /**
  193. * Constructs a DFA content model.
  194. *
  195. * @param syntaxTree The syntax tree of the content model.
  196. * @param leafCount The number of leaves.
  197. * @param dtd if it is created for a DTDGrammar.
  198. *
  199. */
  200. public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) {
  201. // Store away our index and pools in members
  202. //fStringPool = stringPool;
  203. fLeafCount = leafCount;
  204. // this is for Schema Mixed Content
  205. fMixed = mixed;
  206. //
  207. // Ok, so lets grind through the building of the DFA. This method
  208. // handles the high level logic of the algorithm, but it uses a
  209. // number of helper classes to do its thing.
  210. //
  211. // In order to avoid having hundreds of references to the error and
  212. // string handlers around, this guy and all of his helper classes
  213. // just throw a simple exception and we then pass it along.
  214. //
  215. buildDFA(syntaxTree);
  216. }
  217. //
  218. // ContentModelValidator methods
  219. //
  220. /**
  221. * Check that the specified content is valid according to this
  222. * content model. This method can also be called to do 'what if'
  223. * testing of content models just to see if they would be valid.
  224. * <p>
  225. * A value of -1 in the children array indicates a PCDATA node. All other
  226. * indexes will be positive and represent child elements. The count can be
  227. * zero, since some elements have the EMPTY content model and that must be
  228. * confirmed.
  229. *
  230. * @param children The children of this element. Each integer is an index within
  231. * the <code>StringPool</code> of the child element name. An index
  232. * of -1 is used to indicate an occurrence of non-whitespace character
  233. * data.
  234. * @param offset Offset into the array where the children starts.
  235. * @param length The number of entries in the <code>children</code> array.
  236. *
  237. * @return The value -1 if fully valid, else the 0 based index of the child
  238. * that first failed. If the value returned is equal to the number
  239. * of children, then the specified children are valid but additional
  240. * content is required to reach a valid ending state.
  241. *
  242. */
  243. public int validate(QName[] children, int offset, int length) {
  244. if (DEBUG_VALIDATE_CONTENT)
  245. System.out.println("DFAContentModel#validateContent");
  246. //
  247. // A DFA content model must *always* have at least 1 child
  248. // so a failure is given if no children present.
  249. //
  250. // Defect 782: This is an incorrect statement because a DFA
  251. // content model is also used for constructions such as:
  252. //
  253. // (Optional*,NotRequired?)
  254. //
  255. // where a perfectly valid content would be NO CHILDREN.
  256. // Therefore, if there are no children, we must check to
  257. // see if the CMNODE_EOC marker is a valid start state! -Ac
  258. //
  259. if (length == 0) {
  260. if (DEBUG_VALIDATE_CONTENT) {
  261. System.out.println("!!! no children");
  262. System.out.println("elemMap="+fElemMap);
  263. for (int i = 0; i < fElemMap.length; i++) {
  264. String uri = fElemMap[i].uri;
  265. String localpart = fElemMap[i].localpart;
  266. System.out.println("fElemMap["+i+"]="+uri+","+
  267. localpart+" ("+
  268. uri+", "+
  269. localpart+
  270. ')');
  271. }
  272. System.out.println("EOCIndex="+fEOCString);
  273. }
  274. return fEmptyContentIsValid ? -1 : 0;
  275. } // if child count == 0
  276. //
  277. // Lets loop through the children in the array and move our way
  278. // through the states. Note that we use the fElemMap array to map
  279. // an element index to a state index.
  280. //
  281. int curState = 0;
  282. for (int childIndex = 0; childIndex < length; childIndex++)
  283. {
  284. // Get the current element index out
  285. final QName curElem = children[offset + childIndex];
  286. // ignore mixed text
  287. if (fMixed && curElem.localpart == null) {
  288. continue;
  289. }
  290. // Look up this child in our element map
  291. int elemIndex = 0;
  292. for (; elemIndex < fElemMapSize; elemIndex++)
  293. {
  294. int type = fElemMapType[elemIndex] & 0x0f ;
  295. if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) {
  296. //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]);
  297. if (fElemMap[elemIndex].rawname == curElem.rawname) {
  298. break;
  299. }
  300. }
  301. else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) {
  302. String uri = fElemMap[elemIndex].uri;
  303. if (uri == null || uri == curElem.uri) {
  304. break;
  305. }
  306. }
  307. else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) {
  308. if (curElem.uri == null) {
  309. break;
  310. }
  311. }
  312. else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
  313. if (fElemMap[elemIndex].uri != curElem.uri) {
  314. break;
  315. }
  316. }
  317. }
  318. // If we didn't find it, then obviously not valid
  319. if (elemIndex == fElemMapSize) {
  320. if (DEBUG_VALIDATE_CONTENT) {
  321. System.out.println("!!! didn't find it");
  322. System.out.println("curElem : " +curElem );
  323. for (int i=0; i<fElemMapSize; i++) {
  324. System.out.println("fElemMap["+i+"] = " +fElemMap[i] );
  325. System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] );
  326. }
  327. }
  328. return childIndex;
  329. }
  330. //
  331. // Look up the next state for this input symbol when in the
  332. // current state.
  333. //
  334. curState = fTransTable[curState][elemIndex];
  335. // If its not a legal transition, then invalid
  336. if (curState == -1) {
  337. if (DEBUG_VALIDATE_CONTENT)
  338. System.out.println("!!! not a legal transition");
  339. return childIndex;
  340. }
  341. }
  342. //
  343. // We transitioned all the way through the input list. However, that
  344. // does not mean that we ended in a final state. So check whether
  345. // our ending state is a final state.
  346. //
  347. if (DEBUG_VALIDATE_CONTENT)
  348. System.out.println("curState="+curState+", childCount="+length);
  349. if (!fFinalStateFlags[curState])
  350. return length;
  351. // success!
  352. return -1;
  353. } // validate
  354. //
  355. // Private methods
  356. //
  357. /**
  358. * Builds the internal DFA transition table from the given syntax tree.
  359. *
  360. * @param syntaxTree The syntax tree.
  361. *
  362. * @exception CMException Thrown if DFA cannot be built.
  363. */
  364. private void buildDFA(CMNode syntaxTree)
  365. {
  366. //
  367. // The first step we need to take is to rewrite the content model
  368. // using our CMNode objects, and in the process get rid of any
  369. // repetition short cuts, converting them into '*' style repetitions
  370. // or getting rid of repetitions altogether.
  371. //
  372. // The conversions done are:
  373. //
  374. // x+ -> (x|x*)
  375. // x? -> (x|epsilon)
  376. //
  377. // This is a relatively complex scenario. What is happening is that
  378. // we create a top level binary node of which the special EOC value
  379. // is set as the right side node. The the left side is set to the
  380. // rewritten syntax tree. The source is the original content model
  381. // info from the decl pool. The rewrite is done by buildSyntaxTree()
  382. // which recurses the decl pool's content of the element and builds
  383. // a new tree in the process.
  384. //
  385. // Note that, during this operation, we set each non-epsilon leaf
  386. // node's DFA state position and count the number of such leafs, which
  387. // is left in the fLeafCount member.
  388. //
  389. // The nodeTmp object is passed in just as a temp node to use during
  390. // the recursion. Otherwise, we'd have to create a new node on every
  391. // level of recursion, which would be piggy in Java (as is everything
  392. // for that matter.)
  393. //
  394. /* MODIFIED (Jan, 2001)
  395. *
  396. * Use following rules.
  397. * nullable(x+) := nullable(x), first(x+) := first(x), last(x+) := last(x)
  398. * nullable(x?) := true, first(x?) := first(x), last(x?) := last(x)
  399. *
  400. * The same computation of follow as x* is applied to x+
  401. *
  402. * The modification drastically reduces computation time of
  403. * "(a, (b, a+, (c, (b, a+)+, a+, (d, (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
  404. */
  405. fQName.setValues(null, fEOCString, fEOCString, null);
  406. CMLeaf nodeEOC = new CMLeaf(fQName);
  407. fHeadNode = new CMBinOp
  408. (
  409. XMLContentSpec.CONTENTSPECNODE_SEQ
  410. , syntaxTree
  411. , nodeEOC
  412. );
  413. //
  414. // And handle specially the EOC node, which also must be numbered
  415. // and counted as a non-epsilon leaf node. It could not be handled
  416. // in the above tree build because it was created before all that
  417. // started. We save the EOC position since its used during the DFA
  418. // building loop.
  419. //
  420. fEOCPos = fLeafCount;
  421. nodeEOC.setPosition(fLeafCount++);
  422. //
  423. // Ok, so now we have to iterate the new tree and do a little more
  424. // work now that we know the leaf count. One thing we need to do is
  425. // to calculate the first and last position sets of each node. This
  426. // is cached away in each of the nodes.
  427. //
  428. // Along the way we also set the leaf count in each node as the
  429. // maximum state count. They must know this in order to create their
  430. // first/last pos sets.
  431. //
  432. // We also need to build an array of references to the non-epsilon
  433. // leaf nodes. Since we iterate it in the same way as before, this
  434. // will put them in the array according to their position values.
  435. //
  436. fLeafList = new CMLeaf[fLeafCount];
  437. fLeafListType = new int[fLeafCount];
  438. postTreeBuildInit(fHeadNode, 0);
  439. //
  440. // And, moving onward... We now need to build the follow position
  441. // sets for all the nodes. So we allocate an array of state sets,
  442. // one for each leaf node (i.e. each DFA position.)
  443. //
  444. fFollowList = new CMStateSet[fLeafCount];
  445. for (int index = 0; index < fLeafCount; index++)
  446. fFollowList[index] = new CMStateSet(fLeafCount);
  447. calcFollowList(fHeadNode);
  448. //
  449. // And finally the big push... Now we build the DFA using all the
  450. // states and the tree we've built up. First we set up the various
  451. // data structures we are going to use while we do this.
  452. //
  453. // First of all we need an array of unique element names in our
  454. // content model. For each transition table entry, we need a set of
  455. // contiguous indices to represent the transitions for a particular
  456. // input element. So we need to a zero based range of indexes that
  457. // map to element types. This element map provides that mapping.
  458. //
  459. fElemMap = new QName[fLeafCount];
  460. fElemMapType = new int[fLeafCount];
  461. fElemMapSize = 0;
  462. for (int outIndex = 0; outIndex < fLeafCount; outIndex++)
  463. {
  464. fElemMap[outIndex] = new QName();
  465. /****
  466. if ( (fLeafListType[outIndex] & 0x0f) != 0 ) {
  467. if (fLeafNameTypeVector == null) {
  468. fLeafNameTypeVector = new ContentLeafNameTypeVector();
  469. }
  470. }
  471. /****/
  472. // Get the current leaf's element index
  473. final QName element = fLeafList[outIndex].getElement();
  474. // See if the current leaf node's element index is in the list
  475. int inIndex = 0;
  476. for (; inIndex < fElemMapSize; inIndex++)
  477. {
  478. if (fElemMap[inIndex].rawname == element.rawname) {
  479. break;
  480. }
  481. }
  482. // If it was not in the list, then add it, if not the EOC node
  483. if (inIndex == fElemMapSize) {
  484. fElemMap[fElemMapSize].setValues(element);
  485. fElemMapType[fElemMapSize] = fLeafListType[outIndex];
  486. fElemMapSize++;
  487. }
  488. }
  489. // set up the fLeafNameTypeVector object if there is one.
  490. /*****
  491. if (fLeafNameTypeVector != null) {
  492. fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize);
  493. }
  494. /***
  495. * Optimization(Jan, 2001); We sort fLeafList according to
  496. * elemIndex which is *uniquely* associated to each leaf.
  497. * We are *assuming* that each element appears in at least one leaf.
  498. **/
  499. int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
  500. int fSortCount = 0;
  501. for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
  502. for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
  503. final QName leaf = fLeafList[leafIndex].getElement();
  504. final QName element = fElemMap[elemIndex];
  505. if (leaf.rawname == element.rawname) {
  506. fLeafSorter[fSortCount++] = leafIndex;
  507. }
  508. }
  509. fLeafSorter[fSortCount++] = -1;
  510. }
  511. /* Optimization(Jan, 2001) */
  512. //
  513. // Next lets create some arrays, some that that hold transient
  514. // information during the DFA build and some that are permament.
  515. // These are kind of sticky since we cannot know how big they will
  516. // get, but we don't want to use any Java collections because of
  517. // performance.
  518. //
  519. // Basically they will probably be about fLeafCount*2 on average,
  520. // but can be as large as 2^(fLeafCount*2), worst case. So we start
  521. // with fLeafCount*4 as a middle ground. This will be very unlikely
  522. // to ever have to expand, though it if does, the overhead will be
  523. // somewhat ugly.
  524. //
  525. int curArraySize = fLeafCount * 4;
  526. CMStateSet[] statesToDo = new CMStateSet[curArraySize];
  527. fFinalStateFlags = new boolean[curArraySize];
  528. fTransTable = new int[curArraySize][];
  529. //
  530. // Ok we start with the initial set as the first pos set of the
  531. // head node (which is the seq node that holds the content model
  532. // and the EOC node.)
  533. //
  534. CMStateSet setT = fHeadNode.firstPos();
  535. //
  536. // Init our two state flags. Basically the unmarked state counter
  537. // is always chasing the current state counter. When it catches up,
  538. // that means we made a pass through that did not add any new states
  539. // to the lists, at which time we are done. We could have used a
  540. // expanding array of flags which we used to mark off states as we
  541. // complete them, but this is easier though less readable maybe.
  542. //
  543. int unmarkedState = 0;
  544. int curState = 0;
  545. //
  546. // Init the first transition table entry, and put the initial state
  547. // into the states to do list, then bump the current state.
  548. //
  549. fTransTable[curState] = makeDefStateList();
  550. statesToDo[curState] = setT;
  551. curState++;
  552. /* Optimization(Jan, 2001); This is faster for
  553. * a large content model such as, "(t001+|t002+|.... |t500+)".
  554. */
  555. java.util.Hashtable stateTable = new java.util.Hashtable();
  556. /* Optimization(Jan, 2001) */
  557. //
  558. // Ok, almost done with the algorithm... We now enter the
  559. // loop where we go until the states done counter catches up with
  560. // the states to do counter.
  561. //
  562. while (unmarkedState < curState)
  563. {
  564. //
  565. // Get the first unmarked state out of the list of states to do.
  566. // And get the associated transition table entry.
  567. //
  568. setT = statesToDo[unmarkedState];
  569. int[] transEntry = fTransTable[unmarkedState];
  570. // Mark this one final if it contains the EOC state
  571. fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos);
  572. // Bump up the unmarked state count, marking this state done
  573. unmarkedState++;
  574. // Loop through each possible input symbol in the element map
  575. CMStateSet newSet = null;
  576. /* Optimization(Jan, 2001) */
  577. int sorterIndex = 0;
  578. /* Optimization(Jan, 2001) */
  579. for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
  580. {
  581. //
  582. // Build up a set of states which is the union of all of
  583. // the follow sets of DFA positions that are in the current
  584. // state. If we gave away the new set last time through then
  585. // create a new one. Otherwise, zero out the existing one.
  586. //
  587. if (newSet == null)
  588. newSet = new CMStateSet(fLeafCount);
  589. else
  590. newSet.zeroBits();
  591. /* Optimization(Jan, 2001) */
  592. int leafIndex = fLeafSorter[sorterIndex++];
  593. while (leafIndex != -1) {
  594. // If this leaf index (DFA position) is in the current set...
  595. if (setT.getBit(leafIndex))
  596. {
  597. //
  598. // If this leaf is the current input symbol, then we
  599. // want to add its follow list to the set of states to
  600. // transition to from the current state.
  601. //
  602. newSet.union(fFollowList[leafIndex]);
  603. }
  604. leafIndex = fLeafSorter[sorterIndex++];
  605. }
  606. /* Optimization(Jan, 2001) */
  607. //
  608. // If this new set is not empty, then see if its in the list
  609. // of states to do. If not, then add it.
  610. //
  611. if (!newSet.isEmpty())
  612. {
  613. //
  614. // Search the 'states to do' list to see if this new
  615. // state set is already in there.
  616. //
  617. /* Optimization(Jan, 2001) */
  618. Integer stateObj = (Integer)stateTable.get(newSet);
  619. int stateIndex = (stateObj == null ? curState : stateObj.intValue());
  620. /* Optimization(Jan, 2001) */
  621. // If we did not find it, then add it
  622. if (stateIndex == curState)
  623. {
  624. //
  625. // Put this new state into the states to do and init
  626. // a new entry at the same index in the transition
  627. // table.
  628. //
  629. statesToDo[curState] = newSet;
  630. fTransTable[curState] = makeDefStateList();
  631. /* Optimization(Jan, 2001) */
  632. stateTable.put(newSet, new Integer(curState));
  633. /* Optimization(Jan, 2001) */
  634. // We now have a new state to do so bump the count
  635. curState++;
  636. //
  637. // Null out the new set to indicate we adopted it.
  638. // This will cause the creation of a new set on the
  639. // next time around the loop.
  640. //
  641. newSet = null;
  642. }
  643. //
  644. // Now set this state in the transition table's entry
  645. // for this element (using its index), with the DFA
  646. // state we will move to from the current state when we
  647. // see this input element.
  648. //
  649. transEntry[elemIndex] = stateIndex;
  650. // Expand the arrays if we're full
  651. if (curState == curArraySize)
  652. {
  653. //
  654. // Yikes, we overflowed the initial array size, so
  655. // we've got to expand all of these arrays. So adjust
  656. // up the size by 50% and allocate new arrays.
  657. //
  658. final int newSize = (int)(curArraySize * 1.5);
  659. CMStateSet[] newToDo = new CMStateSet[newSize];
  660. boolean[] newFinalFlags = new boolean[newSize];
  661. int[][] newTransTable = new int[newSize][];
  662. // Copy over all of the existing content
  663. for (int expIndex = 0; expIndex < curArraySize; expIndex++)
  664. {
  665. newToDo[expIndex] = statesToDo[expIndex];
  666. newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
  667. newTransTable[expIndex] = fTransTable[expIndex];
  668. }
  669. // Store the new array size
  670. curArraySize = newSize;
  671. statesToDo = newToDo;
  672. fFinalStateFlags = newFinalFlags;
  673. fTransTable = newTransTable;
  674. }
  675. }
  676. }
  677. }
  678. // Check to see if we can set the fEmptyContentIsValid flag.
  679. fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable();
  680. //
  681. // And now we can say bye bye to the temp representation since we've
  682. // built the DFA.
  683. //
  684. if (DEBUG_VALIDATE_CONTENT)
  685. dumpTree(fHeadNode, 0);
  686. fHeadNode = null;
  687. fLeafList = null;
  688. fFollowList = null;
  689. }
  690. /**
  691. * Calculates the follow list of the current node.
  692. *
  693. * @param nodeCur The curent node.
  694. *
  695. * @exception CMException Thrown if follow list cannot be calculated.
  696. */
  697. private void calcFollowList(CMNode nodeCur)
  698. {
  699. // Recurse as required
  700. if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
  701. {
  702. // Recurse only
  703. calcFollowList(((CMBinOp)nodeCur).getLeft());
  704. calcFollowList(((CMBinOp)nodeCur).getRight());
  705. }
  706. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)
  707. {
  708. // Recurse first
  709. calcFollowList(((CMBinOp)nodeCur).getLeft());
  710. calcFollowList(((CMBinOp)nodeCur).getRight());
  711. //
  712. // Now handle our level. We use our left child's last pos
  713. // set and our right child's first pos set, so go ahead and
  714. // get them ahead of time.
  715. //
  716. final CMStateSet last = ((CMBinOp)nodeCur).getLeft().lastPos();
  717. final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos();
  718. //
  719. // Now, for every position which is in our left child's last set
  720. // add all of the states in our right child's first set to the
  721. // follow set for that position.
  722. //
  723. for (int index = 0; index < fLeafCount; index++)
  724. {
  725. if (last.getBit(index))
  726. fFollowList[index].union(first);
  727. }
  728. }
  729. /***
  730. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
  731. {
  732. // Recurse first
  733. calcFollowList(((CMUniOp)nodeCur).getChild());
  734. //
  735. // Now handle our level. We use our own first and last position
  736. // sets, so get them up front.
  737. //
  738. final CMStateSet first = nodeCur.firstPos();
  739. final CMStateSet last = nodeCur.lastPos();
  740. //
  741. // For every position which is in our last position set, add all
  742. // of our first position states to the follow set for that
  743. // position.
  744. //
  745. for (int index = 0; index < fLeafCount; index++)
  746. {
  747. if (last.getBit(index))
  748. fFollowList[index].union(first);
  749. }
  750. }
  751. else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
  752. || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE))
  753. {
  754. throw new RuntimeException("ImplementationMessages.VAL_NIICM");
  755. }
  756. /***/
  757. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
  758. || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
  759. {
  760. // Recurse first
  761. calcFollowList(((CMUniOp)nodeCur).getChild());
  762. //
  763. // Now handle our level. We use our own first and last position
  764. // sets, so get them up front.
  765. //
  766. final CMStateSet first = nodeCur.firstPos();
  767. final CMStateSet last = nodeCur.lastPos();
  768. //
  769. // For every position which is in our last position set, add all
  770. // of our first position states to the follow set for that
  771. // position.
  772. //
  773. for (int index = 0; index < fLeafCount; index++)
  774. {
  775. if (last.getBit(index))
  776. fFollowList[index].union(first);
  777. }
  778. }
  779. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) {
  780. // Recurse only
  781. calcFollowList(((CMUniOp)nodeCur).getChild());
  782. }
  783. /***/
  784. }
  785. /**
  786. * Dumps the tree of the current node to standard output.
  787. *
  788. * @param nodeCur The current node.
  789. * @param level The maximum levels to output.
  790. *
  791. * @exception CMException Thrown on error.
  792. */
  793. private void dumpTree(CMNode nodeCur, int level)
  794. {
  795. for (int index = 0; index < level; index++)
  796. System.out.print(" ");
  797. int type = nodeCur.type();
  798. if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
  799. || (type == XMLContentSpec.CONTENTSPECNODE_SEQ))
  800. {
  801. if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
  802. System.out.print("Choice Node ");
  803. else
  804. System.out.print("Seq Node ");
  805. if (nodeCur.isNullable())
  806. System.out.print("Nullable ");
  807. System.out.print("firstPos=");
  808. System.out.print(nodeCur.firstPos().toString());
  809. System.out.print(" lastPos=");
  810. System.out.println(nodeCur.lastPos().toString());
  811. dumpTree(((CMBinOp)nodeCur).getLeft(), level+1);
  812. dumpTree(((CMBinOp)nodeCur).getRight(), level+1);
  813. }
  814. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
  815. {
  816. System.out.print("Rep Node ");
  817. if (nodeCur.isNullable())
  818. System.out.print("Nullable ");
  819. System.out.print("firstPos=");
  820. System.out.print(nodeCur.firstPos().toString());
  821. System.out.print(" lastPos=");
  822. System.out.println(nodeCur.lastPos().toString());
  823. dumpTree(((CMUniOp)nodeCur).getChild(), level+1);
  824. }
  825. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
  826. {
  827. System.out.print
  828. (
  829. "Leaf: (pos="
  830. + ((CMLeaf)nodeCur).getPosition()
  831. + "), "
  832. + ((CMLeaf)nodeCur).getElement()
  833. + "(elemIndex="
  834. + ((CMLeaf)nodeCur).getElement()
  835. + ") "
  836. );
  837. if (nodeCur.isNullable())
  838. System.out.print(" Nullable ");
  839. System.out.print("firstPos=");
  840. System.out.print(nodeCur.firstPos().toString());
  841. System.out.print(" lastPos=");
  842. System.out.println(nodeCur.lastPos().toString());
  843. }
  844. else
  845. {
  846. throw new RuntimeException("ImplementationMessages.VAL_NIICM");
  847. }
  848. }
  849. /**
  850. * -1 is used to represent bad transitions in the transition table
  851. * entry for each state. So each entry is initialized to an all -1
  852. * array. This method creates a new entry and initializes it.
  853. */
  854. private int[] makeDefStateList()
  855. {
  856. int[] retArray = new int[fElemMapSize];
  857. for (int index = 0; index < fElemMapSize; index++)
  858. retArray[index] = -1;
  859. return retArray;
  860. }
  861. /** Post tree build initialization. */
  862. private int postTreeBuildInit(CMNode nodeCur, int curIndex)
  863. {
  864. // Set the maximum states on this node
  865. nodeCur.setMaxStates(fLeafCount);
  866. // Recurse as required
  867. if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY ||
  868. (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL ||
  869. (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
  870. // REVISIT: Don't waste these structures.
  871. QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI());
  872. fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition());
  873. fLeafListType[curIndex] = nodeCur.type();
  874. curIndex++;
  875. }
  876. else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
  877. || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ))
  878. {
  879. curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex);
  880. curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex);
  881. }
  882. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
  883. || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE
  884. || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)
  885. {
  886. curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex);
  887. }
  888. else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
  889. {
  890. //
  891. // Put this node in the leaf list at the current index if its
  892. // a non-epsilon leaf.
  893. //
  894. final QName node = ((CMLeaf)nodeCur).getElement();
  895. if (node.localpart != fEpsilonString) {
  896. fLeafList[curIndex] = (CMLeaf)nodeCur;
  897. fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF;
  898. curIndex++;
  899. }
  900. }
  901. else
  902. {
  903. throw new RuntimeException("ImplementationMessages.VAL_NIICM: type="+nodeCur.type());
  904. }
  905. return curIndex;
  906. }
  907. } // class DFAContentModel