1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. // Aug 21, 2000:
  58. // Fixed bug in isElement and made HTMLdtd public.
// Contributed by Eric SCHAEFFER <eschaeffer@posterconseil.com>
  60. package com.sun.org.apache.xml.internal.serialize;
  61. import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
  62. import java.io.InputStream;
  63. import java.io.InputStreamReader;
  64. import java.io.BufferedReader;
  65. import java.util.Hashtable;
  66. import java.util.Locale;
  67. /**
  68. * Utility class for accessing information specific to HTML documents.
  69. * The HTML DTD is expressed as three utility function groups. Two methods
  70. * allow for checking whether an element requires an open tag on printing
  71. * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
  72. * <P>
  73. * Two other methods translate character references from name to value and
  74. * from value to name. A small entities resource is loaded into memory the
  75. * first time any of these methods is called for fast and efficient access.
  76. *
  77. *
  78. * @version $Revision: 1.17 $ $Date: 2004/02/10 17:25:26 $
  79. * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  80. */
  81. public final class HTMLdtd
  82. {
  83. /**
  84. * Public identifier for HTML 4.01 (Strict) document type.
  85. */
  86. public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
  87. /**
  88. * System identifier for HTML 4.01 (Strict) document type.
  89. */
  90. public static final String HTMLSystemId =
  91. "http://www.w3.org/TR/html4/strict.dtd";
  92. /**
  93. * Public identifier for XHTML 1.0 (Strict) document type.
  94. */
  95. public static final String XHTMLPublicId =
  96. "-//W3C//DTD XHTML 1.0 Strict//EN";
  97. /**
  98. * System identifier for XHTML 1.0 (Strict) document type.
  99. */
  100. public static final String XHTMLSystemId =
  101. "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
  102. /**
  103. * Table of reverse character reference mapping. Character codes are held
  104. * as single-character strings, mapped to their reference name.
  105. */
  106. private static Hashtable _byChar;
  107. /**
  108. * Table of entity name to value mapping. Entities are held as strings,
  109. * character references as <TT>Character</TT> objects.
  110. */
  111. private static Hashtable _byName;
  112. private static Hashtable _boolAttrs;
  113. /**
  114. * Holds element definitions.
  115. */
  116. private static Hashtable _elemDefs;
  117. /**
  118. * Locates the HTML entities file that is loaded upon initialization.
  119. * This file is a resource loaded with the default class loader.
  120. */
  121. private static final String ENTITIES_RESOURCE = "HTMLEntities.res";
  122. /**
  123. * Only opening tag should be printed.
  124. */
  125. private static final int ONLY_OPENING = 0x0001;
  126. /**
  127. * Element contains element content only.
  128. */
  129. private static final int ELEM_CONTENT = 0x0002;
  130. /**
  131. * Element preserve spaces.
  132. */
  133. private static final int PRESERVE = 0x0004;
  134. /**
  135. * Optional closing tag.
  136. */
  137. private static final int OPT_CLOSING = 0x0008;
  138. /**
  139. * Element is empty (also means only opening tag)
  140. */
  141. private static final int EMPTY = 0x0010 | ONLY_OPENING;
  142. /**
  143. * Allowed to appear in head.
  144. */
  145. private static final int ALLOWED_HEAD = 0x0020;
  146. /**
  147. * When opened, closes P.
  148. */
  149. private static final int CLOSE_P = 0x0040;
  150. /**
  151. * When opened, closes DD or DT.
  152. */
  153. private static final int CLOSE_DD_DT = 0x0080;
  154. /**
  155. * When opened, closes itself.
  156. */
  157. private static final int CLOSE_SELF = 0x0100;
  158. /**
  159. * When opened, closes another table section.
  160. */
  161. private static final int CLOSE_TABLE = 0x0200;
  162. /**
  163. * When opened, closes TH or TD.
  164. */
  165. private static final int CLOSE_TH_TD = 0x04000;
  166. /**
  167. * Returns true if element is declared to be empty. HTML elements are
  168. * defines as empty in the DTD, not by the document syntax.
  169. *
  170. * @param tagName The element tag name (upper case)
  171. * @return True if element is empty
  172. */
  173. public static boolean isEmptyTag( String tagName )
  174. {
  175. return isElement( tagName, EMPTY );
  176. }
  177. /**
  178. * Returns true if element is declared to have element content.
  179. * Whitespaces appearing inside element content will be ignored,
  180. * other text will simply report an error.
  181. *
  182. * @param tagName The element tag name (upper case)
  183. * @return True if element content
  184. */
  185. public static boolean isElementContent( String tagName )
  186. {
  187. return isElement( tagName, ELEM_CONTENT );
  188. }
  189. /**
  190. * Returns true if element's textual contents preserves spaces.
  191. * This only applies to PRE and TEXTAREA, all other HTML elements
  192. * do not preserve space.
  193. *
  194. * @param tagName The element tag name (upper case)
  195. * @return True if element's text content preserves spaces
  196. */
  197. public static boolean isPreserveSpace( String tagName )
  198. {
  199. return isElement( tagName, PRESERVE );
  200. }
  201. /**
  202. * Returns true if element's closing tag is optional and need not
  203. * exist. An error will not be reported for such elements if they
  204. * are not closed. For example, <tt>LI</tt> is most often not closed.
  205. *
  206. * @param tagName The element tag name (upper case)
  207. * @return True if closing tag implied
  208. */
  209. public static boolean isOptionalClosing( String tagName )
  210. {
  211. return isElement( tagName, OPT_CLOSING );
  212. }
  213. /**
  214. * Returns true if element's closing tag is generally not printed.
  215. * For example, <tt>LI</tt> should not print the closing tag.
  216. *
  217. * @param tagName The element tag name (upper case)
  218. * @return True if only opening tag should be printed
  219. */
  220. public static boolean isOnlyOpening( String tagName )
  221. {
  222. return isElement( tagName, ONLY_OPENING );
  223. }
  224. /**
  225. * Returns true if the opening of one element (<tt>tagName</tt>) implies
  226. * the closing of another open element (<tt>openTag</tt>). For example,
  227. * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
  228. * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
  229. *
  230. * @param tagName The newly opened element
  231. * @param openTag The already opened element
  232. * @return True if closing tag closes opening tag
  233. */
  234. public static boolean isClosing( String tagName, String openTag )
  235. {
  236. // Several elements are defined as closing the HEAD
  237. if ( openTag.equalsIgnoreCase( "HEAD" ) )
  238. return ! isElement( tagName, ALLOWED_HEAD );
  239. // P closes iteself
  240. if ( openTag.equalsIgnoreCase( "P" ) )
  241. return isElement( tagName, CLOSE_P );
  242. // DT closes DD, DD closes DT
  243. if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
  244. return isElement( tagName, CLOSE_DD_DT );
  245. // LI and OPTION close themselves
  246. if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
  247. return isElement( tagName, CLOSE_SELF );
  248. // Each of these table sections closes all the others
  249. if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
  250. openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
  251. openTag.equalsIgnoreCase( "COLGROUP" ) )
  252. return isElement( tagName, CLOSE_TABLE );
  253. // TD closes TH and TH closes TD
  254. if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
  255. return isElement( tagName, CLOSE_TH_TD );
  256. return false;
  257. }
  258. /**
  259. * Returns true if the specified attribute it a URI and should be
  260. * escaped appropriately. In HTML URIs are escaped differently
  261. * than normal attributes.
  262. *
  263. * @param tagName The element's tag name
  264. * @param attrName The attribute's name
  265. */
  266. public static boolean isURI( String tagName, String attrName )
  267. {
  268. // Stupid checks.
  269. return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
  270. }
  271. /**
  272. * Returns true if the specified attribute is a boolean and should be
  273. * printed without the value. This applies to attributes that are true
  274. * if they exist, such as selected (OPTION/INPUT).
  275. *
  276. * @param tagName The element's tag name
  277. * @param attrName The attribute's name
  278. */
  279. public static boolean isBoolean( String tagName, String attrName )
  280. {
  281. String[] attrNames;
  282. attrNames = (String[]) _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
  283. if ( attrNames == null )
  284. return false;
  285. for ( int i = 0 ; i < attrNames.length ; ++i )
  286. if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
  287. return true;
  288. return false;
  289. }
  290. /**
  291. * Returns the value of an HTML character reference by its name. If the
  292. * reference is not found or was not defined as a character reference,
  293. * returns EOF (-1).
  294. *
  295. * @param name Name of character reference
  296. * @return Character code or EOF (-1)
  297. */
  298. public static int charFromName( String name )
  299. {
  300. Object value;
  301. initialize();
  302. value = _byName.get( name );
  303. if ( value != null && value instanceof Integer )
  304. return ( (Integer) value ).intValue();
  305. else
  306. return -1;
  307. }
  308. /**
  309. * Returns the name of an HTML character reference based on its character
  310. * value. Only valid for entities defined from character references. If no
  311. * such character value was defined, return null.
  312. *
  313. * @param value Character value of entity
  314. * @return Entity's name or null
  315. */
  316. public static String fromChar(int value )
  317. {
  318. if (value > 0xffff)
  319. return null;
  320. String name;
  321. initialize();
  322. name = (String) _byChar.get( new Integer( value ) );
  323. return name;
  324. }
  325. /**
  326. * Initialize upon first access. Will load all the HTML character references
  327. * into a list that is accessible by name or character value and is optimized
  328. * for character substitution. This method may be called any number of times
  329. * but will execute only once.
  330. */
  331. private static void initialize()
  332. {
  333. InputStream is = null;
  334. BufferedReader reader = null;
  335. int index;
  336. String name;
  337. String value;
  338. int code;
  339. String line;
  340. // Make sure not to initialize twice.
  341. if ( _byName != null )
  342. return;
  343. try {
  344. _byName = new Hashtable();
  345. _byChar = new Hashtable();
  346. is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
  347. if ( is == null ) {
  348. throw new RuntimeException(
  349. DOMMessageFormatter.formatMessage(
  350. DOMMessageFormatter.SERIALIZER_DOMAIN,
  351. "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
  352. }
  353. reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
  354. line = reader.readLine();
  355. while ( line != null ) {
  356. if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
  357. line = reader.readLine();
  358. continue;
  359. }
  360. index = line.indexOf( ' ' );
  361. if ( index > 1 ) {
  362. name = line.substring( 0, index );
  363. ++index;
  364. if ( index < line.length() ) {
  365. value = line.substring( index );
  366. index = value.indexOf( ' ' );
  367. if ( index > 0 )
  368. value = value.substring( 0, index );
  369. code = Integer.parseInt( value );
  370. defineEntity( name, (char) code );
  371. }
  372. }
  373. line = reader.readLine();
  374. }
  375. is.close();
  376. } catch ( Exception except ) {
  377. throw new RuntimeException(
  378. DOMMessageFormatter.formatMessage(
  379. DOMMessageFormatter.SERIALIZER_DOMAIN,
  380. "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
  381. } finally {
  382. if ( is != null ) {
  383. try {
  384. is.close();
  385. } catch ( Exception except ) { }
  386. }
  387. }
  388. }
  389. /**
  390. * Defines a new character reference. The reference's name and value are
  391. * supplied. Nothing happens if the character reference is already defined.
  392. * <P>
  393. * Unlike internal entities, character references are a string to single
  394. * character mapping. They are used to map non-ASCII characters both on
  395. * parsing and printing, primarily for HTML documents. '<amp;' is an
  396. * example of a character reference.
  397. *
  398. * @param name The entity's name
  399. * @param value The entity's value
  400. */
  401. private static void defineEntity( String name, char value )
  402. {
  403. if ( _byName.get( name ) == null ) {
  404. _byName.put( name, new Integer( value ) );
  405. _byChar.put( new Integer( value ), name );
  406. }
  407. }
  408. private static void defineElement( String name, int flags )
  409. {
  410. _elemDefs.put( name, new Integer( flags ) );
  411. }
  412. private static void defineBoolean( String tagName, String attrName )
  413. {
  414. defineBoolean( tagName, new String[] { attrName } );
  415. }
  416. private static void defineBoolean( String tagName, String[] attrNames )
  417. {
  418. _boolAttrs.put( tagName, attrNames );
  419. }
  420. private static boolean isElement( String name, int flag )
  421. {
  422. Integer flags;
  423. flags = (Integer) _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
  424. if ( flags == null )
  425. return false;
  426. else
  427. return ( ( flags.intValue() & flag ) == flag );
  428. }
  429. static
  430. {
  431. _elemDefs = new Hashtable();
  432. defineElement( "ADDRESS", CLOSE_P );
  433. defineElement( "AREA", EMPTY );
  434. defineElement( "BASE", EMPTY | ALLOWED_HEAD );
  435. defineElement( "BASEFONT", EMPTY );
  436. defineElement( "BLOCKQUOTE", CLOSE_P );
  437. defineElement( "BODY", OPT_CLOSING );
  438. defineElement( "BR", EMPTY );
  439. defineElement( "COL", EMPTY );
  440. defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
  441. defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
  442. defineElement( "DIV", CLOSE_P );
  443. defineElement( "DL", ELEM_CONTENT | CLOSE_P );
  444. defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
  445. defineElement( "FIELDSET", CLOSE_P );
  446. defineElement( "FORM", CLOSE_P );
  447. defineElement( "FRAME", EMPTY | OPT_CLOSING );
  448. defineElement( "H1", CLOSE_P );
  449. defineElement( "H2", CLOSE_P );
  450. defineElement( "H3", CLOSE_P );
  451. defineElement( "H4", CLOSE_P );
  452. defineElement( "H5", CLOSE_P );
  453. defineElement( "H6", CLOSE_P );
  454. defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
  455. defineElement( "HR", EMPTY | CLOSE_P );
  456. defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
  457. defineElement( "IMG", EMPTY );
  458. defineElement( "INPUT", EMPTY );
  459. defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
  460. defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
  461. defineElement( "LINK", EMPTY | ALLOWED_HEAD );
  462. defineElement( "MAP", ALLOWED_HEAD );
  463. defineElement( "META", EMPTY | ALLOWED_HEAD );
  464. defineElement( "OL", ELEM_CONTENT | CLOSE_P );
  465. defineElement( "OPTGROUP", ELEM_CONTENT );
  466. defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
  467. defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
  468. defineElement( "PARAM", EMPTY );
  469. defineElement( "PRE", PRESERVE | CLOSE_P );
  470. defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
  471. defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
  472. defineElement( "SELECT", ELEM_CONTENT );
  473. defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
  474. defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
  475. defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
  476. defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
  477. defineElement( "TEXTAREA", PRESERVE );
  478. defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
  479. defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
  480. defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
  481. defineElement( "TITLE", ALLOWED_HEAD );
  482. defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
  483. defineElement( "UL", ELEM_CONTENT | CLOSE_P );
  484. _boolAttrs = new Hashtable();
  485. defineBoolean( "AREA", "href" );
  486. defineBoolean( "BUTTON", "disabled" );
  487. defineBoolean( "DIR", "compact" );
  488. defineBoolean( "DL", "compact" );
  489. defineBoolean( "FRAME", "noresize" );
  490. defineBoolean( "HR", "noshade" );
  491. defineBoolean( "IMAGE", "ismap" );
  492. defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
  493. defineBoolean( "LINK", "link" );
  494. defineBoolean( "MENU", "compact" );
  495. defineBoolean( "OBJECT", "declare" );
  496. defineBoolean( "OL", "compact" );
  497. defineBoolean( "OPTGROUP", "disabled" );
  498. defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
  499. defineBoolean( "SCRIPT", "defer" );
  500. defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
  501. defineBoolean( "STYLE", "disabled" );
  502. defineBoolean( "TD", "nowrap" );
  503. defineBoolean( "TH", "nowrap" );
  504. defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
  505. defineBoolean( "UL", "compact" );
  506. initialize();
  507. }
  508. }