1. /*
  2. * @(#)CodeSetConversion.java 1.19 04/03/01
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package com.sun.corba.se.impl.encoding;
  8. import java.util.Map;
  9. import java.util.HashMap;
  10. import java.nio.ByteBuffer;
  11. import java.nio.CharBuffer;
  12. import java.nio.charset.Charset;
  13. import java.nio.charset.CharsetEncoder;
  14. import java.nio.charset.CharsetDecoder;
  15. import java.nio.charset.CharacterCodingException;
  16. import java.nio.charset.IllegalCharsetNameException;
  17. import java.nio.charset.MalformedInputException;
  18. import java.nio.charset.UnsupportedCharsetException;
  19. import java.nio.charset.UnmappableCharacterException;
  20. import com.sun.corba.se.impl.logging.ORBUtilSystemException;
  21. import com.sun.corba.se.impl.logging.OMGSystemException;
  22. import com.sun.corba.se.spi.logging.CORBALogDomains;
  /**
   * Collection of classes, interfaces, and factory methods for
   * CORBA code set conversion.
   *
   * This is mainly used to shield other code from the underlying
   * character converters (originally the sun.io converters, now the
   * java.nio.charset encoders and decoders), which might change, as
   * well as to provide some basic translation from conversion errors
   * to CORBA exceptions. Some extra work is required here to
   * accommodate the way CORBA says it uses UTF-16 as of the
   * 00-11-03 spec.
   *
   * REVISIT - Since java.nio.charset.CharsetEncoder and CharsetDecoder
   * operate on NIO ByteBuffers and CharBuffers, the interaction
   * and interface between this class and the CDR streams
   * should be looked at more closely for optimizations that
   * avoid unnecessary copying of data between char[] and
   * CharBuffer and between byte[] and ByteBuffer, especially
   * DirectByteBuffers.
   */
  42. public class CodeSetConversion
  43. {
  44. /**
  45. * Abstraction for char to byte conversion.
  46. *
  47. * Must be used in the proper sequence:
  48. *
  49. * 1) convert
  50. * 2) Optional getNumBytes and/or getAlignment (if necessary)
  51. * 3) getBytes (see warning)
  52. */
  53. public abstract static class CTBConverter
  54. {
  55. // Perform the conversion of the provided char or String,
  56. // allowing the caller to query for more information
  57. // before writing.
  58. public abstract void convert(char chToConvert);
  59. public abstract void convert(String strToConvert);
  60. // How many bytes resulted from the conversion?
  61. public abstract int getNumBytes();
  62. // What's the maximum number of bytes per character?
  63. public abstract float getMaxBytesPerChar();
  64. public abstract boolean isFixedWidthEncoding();
  65. // What byte boundary should the stream align to before
  66. // calling writeBytes? For instance, a fixed width
  67. // encoding with 2 bytes per char in a stream which
  68. // doesn't encapsulate the char's bytes should align
  69. // on a 2 byte boundary. (Ex: UTF16 in GIOP1.1)
  70. //
  71. // Note: This has no effect on the converted bytes. It
  72. // is just information available to the caller.
  73. public abstract int getAlignment();
  74. // Get the resulting bytes. Warning: You must use getNumBytes()
  75. // to determine the end of the data in the byte array instead
  76. // of array.length! The array may be used internally, so don't
  77. // save references.
  78. public abstract byte[] getBytes();
  79. }
  80. /**
  81. * Abstraction for byte to char conversion.
  82. */
  83. public abstract static class BTCConverter
  84. {
  85. // In GIOP 1.1, interoperability can only be achieved with
  86. // fixed width encodings like UTF-16. This is because wstrings
  87. // specified how many code points follow rather than specifying
  88. // the length in octets.
  89. public abstract boolean isFixedWidthEncoding();
  90. public abstract int getFixedCharWidth();
  91. // Called after getChars to determine the true size of the
  92. // converted array.
  93. public abstract int getNumChars();
  94. // Perform the conversion using length bytes from the given
  95. // input stream. Warning: You must use getNumChars() to
  96. // determine the correct length of the resulting array.
  97. // The same array may be used internally over multiple
  98. // calls.
  99. public abstract char[] getChars(byte[] bytes, int offset, int length);
  100. }
  101. /**
  102. * Implementation of CTBConverter which uses a nio.Charset.CharsetEncoder
  103. * to do the real work. Handles translation of exceptions to the
  104. * appropriate CORBA versions.
  105. */
  106. private class JavaCTBConverter extends CTBConverter
  107. {
  108. private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
  109. CORBALogDomains.RPC_ENCODING ) ;
  110. private OMGSystemException omgWrapper = OMGSystemException.get(
  111. CORBALogDomains.RPC_ENCODING ) ;
  112. // nio.Charset.CharsetEncoder actually does the work here
  113. // have to use it directly rather than through String's interface
  114. // because we want to know when errors occur during the conversion.
  115. private CharsetEncoder ctb;
  116. // Proper alignment for this type of converter. For instance,
  117. // ASCII has alignment of 1 (1 byte per char) but UTF16 has
  118. // alignment of 2 (2 bytes per char)
  119. private int alignment;
  120. // Char buffer to hold the input.
  121. private char[] chars = null;
  122. // How many bytes are generated from the conversion?
  123. private int numBytes = 0;
  124. // How many characters were converted (temporary variable
  125. // for cross method communication)
  126. private int numChars = 0;
  127. // ByteBuffer holding the converted input. This is necessary
  128. // since we have to do calculations that require the conversion
  129. // before writing the array to the stream.
  130. private ByteBuffer buffer;
  131. // What code set are we using?
  132. private OSFCodeSetRegistry.Entry codeset;
  133. public JavaCTBConverter(OSFCodeSetRegistry.Entry codeset,
  134. int alignmentForEncoding) {
  135. try {
  136. ctb = cache.getCharToByteConverter(codeset.getName());
  137. if (ctb == null) {
  138. Charset tmpCharset = Charset.forName(codeset.getName());
  139. ctb = tmpCharset.newEncoder();
  140. cache.setConverter(codeset.getName(), ctb);
  141. }
  142. } catch(IllegalCharsetNameException icne) {
  143. // This can only happen if one of our Entries has
  144. // an invalid name.
  145. throw wrapper.invalidCtbConverterName(icne,codeset.getName());
  146. } catch(UnsupportedCharsetException ucne) {
  147. // This can only happen if one of our Entries has
  148. // an unsupported name.
  149. throw wrapper.invalidCtbConverterName(ucne,codeset.getName());
  150. }
  151. this.codeset = codeset;
  152. alignment = alignmentForEncoding;
  153. }
  154. public final float getMaxBytesPerChar() {
  155. return ctb.maxBytesPerChar();
  156. }
  157. public void convert(char chToConvert) {
  158. if (chars == null)
  159. chars = new char[1];
  160. // The CharToByteConverter only takes a char[]
  161. chars[0] = chToConvert;
  162. numChars = 1;
  163. convertCharArray();
  164. }
  165. public void convert(String strToConvert) {
  166. // Try to save a memory allocation if possible. Usual
  167. // space/time trade off. If we could get the char[] out of
  168. // the String without copying, that would be great, but
  169. // it's forbidden since String is immutable.
  170. if (chars == null || chars.length < strToConvert.length())
  171. chars = new char[strToConvert.length()];
  172. numChars = strToConvert.length();
  173. strToConvert.getChars(0, numChars, chars, 0);
  174. convertCharArray();
  175. }
  176. public final int getNumBytes() {
  177. return numBytes;
  178. }
  179. public final int getAlignment() {
  180. return alignment;
  181. }
  182. public final boolean isFixedWidthEncoding() {
  183. return codeset.isFixedWidth();
  184. }
  185. public byte[] getBytes() {
  186. // Note that you can't use buffer.length since the buffer might
  187. // be larger than the actual number of converted bytes depending
  188. // on the encoding.
  189. return buffer.array();
  190. }
  191. private void convertCharArray() {
  192. try {
  193. // Possible optimization of directly converting into the CDR buffer.
  194. // However, that means the CDR code would have to reserve
  195. // a 4 byte string length ahead of time, and we'd need a
  196. // confusing partial conversion scheme for when we couldn't
  197. // fit everything in the buffer but needed to know the
  198. // converted length before proceeding due to fragmentation.
  199. // Then there's the issue of the chunking code.
  200. //
  201. // For right now, this is less messy and basic tests don't
  202. // show more than a 1 ms penalty worst case. Less than a
  203. // factor of 2 increase.
  204. // Convert the characters
  205. buffer = ctb.encode(CharBuffer.wrap(chars,0,numChars));
  206. // ByteBuffer returned by the encoder will set its limit
  207. // to byte immediately after the last written byte.
  208. numBytes = buffer.limit();
  209. } catch (IllegalStateException ise) {
  210. // an encoding operation is already in progress
  211. throw wrapper.ctbConverterFailure( ise ) ;
  212. } catch (MalformedInputException mie) {
  213. // There were illegal Unicode char pairs
  214. throw wrapper.badUnicodePair( mie ) ;
  215. } catch (UnmappableCharacterException uce) {
  216. // A character doesn't map to the desired code set
  217. // CORBA formal 00-11-03.
  218. throw omgWrapper.charNotInCodeset( uce ) ;
  219. } catch (CharacterCodingException cce) {
  220. // If this happens, then some other encoding error occured
  221. throw wrapper.ctbConverterFailure( cce ) ;
  222. }
  223. }
  224. }
  225. /**
  226. * Special UTF16 converter which can either always write a BOM
  227. * or use a specified byte order without one.
  228. */
  229. private class UTF16CTBConverter extends JavaCTBConverter
  230. {
  231. // Using this constructor, we will always write a BOM
  232. public UTF16CTBConverter() {
  233. super(OSFCodeSetRegistry.UTF_16, 2);
  234. }
  235. // Using this constructor, we don't use a BOM and use the
  236. // byte order specified
  237. public UTF16CTBConverter(boolean littleEndian) {
  238. super(littleEndian ?
  239. OSFCodeSetRegistry.UTF_16LE :
  240. OSFCodeSetRegistry.UTF_16BE,
  241. 2);
  242. }
  243. }
  244. /**
  245. * Implementation of BTCConverter which uses a sun.io.ByteToCharConverter
  246. * for the real work. Handles translation of exceptions to the
  247. * appropriate CORBA versions.
  248. */
  249. private class JavaBTCConverter extends BTCConverter
  250. {
  251. private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
  252. CORBALogDomains.RPC_ENCODING ) ;
  253. private OMGSystemException omgWrapper = OMGSystemException.get(
  254. CORBALogDomains.RPC_ENCODING ) ;
  255. protected CharsetDecoder btc;
  256. private char[] buffer;
  257. private int resultingNumChars;
  258. private OSFCodeSetRegistry.Entry codeset;
  259. public JavaBTCConverter(OSFCodeSetRegistry.Entry codeset) {
  260. // Obtain a Decoder
  261. btc = this.getConverter(codeset.getName());
  262. this.codeset = codeset;
  263. }
  264. public final boolean isFixedWidthEncoding() {
  265. return codeset.isFixedWidth();
  266. }
  267. // Should only be called if isFixedWidthEncoding is true
  268. // IMPORTANT: This calls OSFCodeSetRegistry.Entry, not
  269. // CharsetDecoder.maxCharsPerByte().
  270. public final int getFixedCharWidth() {
  271. return codeset.getMaxBytesPerChar();
  272. }
  273. public final int getNumChars() {
  274. return resultingNumChars;
  275. }
  276. public char[] getChars(byte[] bytes, int offset, int numBytes) {
  277. // Possible optimization of reading directly from the CDR
  278. // byte buffer. The sun.io converter supposedly can handle
  279. // incremental conversions in which a char is broken across
  280. // two convert calls.
  281. //
  282. // Basic tests didn't show more than a 1 ms increase
  283. // worst case. It's less than a factor of 2 increase.
  284. // Also makes the interface more difficult.
  285. try {
  286. ByteBuffer byteBuf = ByteBuffer.wrap(bytes, offset, numBytes);
  287. CharBuffer charBuf = btc.decode(byteBuf);
  288. // CharBuffer returned by the decoder will set its limit
  289. // to byte immediately after the last written byte.
  290. resultingNumChars = charBuf.limit();
  291. // IMPORTANT - It's possible the underlying char[] in the
  292. // CharBuffer returned by btc.decode(byteBuf)
  293. // is longer in length than the number of characters
  294. // decoded. Hence, the check below to ensure the
  295. // char[] returned contains all the chars that have
  296. // been decoded and no more.
  297. if (charBuf.limit() == charBuf.capacity()) {
  298. buffer = charBuf.array();
  299. } else {
  300. buffer = new char[charBuf.limit()];
  301. charBuf.get(buffer, 0, charBuf.limit()).position(0);
  302. }
  303. return buffer;
  304. } catch (IllegalStateException ile) {
  305. // There were a decoding operation already in progress
  306. throw wrapper.btcConverterFailure( ile ) ;
  307. } catch (MalformedInputException mie) {
  308. // There were illegal Unicode char pairs
  309. throw wrapper.badUnicodePair( mie ) ;
  310. } catch (UnmappableCharacterException uce) {
  311. // A character doesn't map to the desired code set.
  312. // CORBA formal 00-11-03.
  313. throw omgWrapper.charNotInCodeset( uce ) ;
  314. } catch (CharacterCodingException cce) {
  315. // If this happens, then a character decoding error occured.
  316. throw wrapper.btcConverterFailure( cce ) ;
  317. }
  318. }
  319. /**
  320. * Utility method to find a CharsetDecoder in the
  321. * cache or create a new one if necessary. Throws an
  322. * INTERNAL if the code set is unknown.
  323. */
  324. protected CharsetDecoder getConverter(String javaCodeSetName) {
  325. CharsetDecoder result = null;
  326. try {
  327. result = cache.getByteToCharConverter(javaCodeSetName);
  328. if (result == null) {
  329. Charset tmpCharset = Charset.forName(javaCodeSetName);
  330. result = tmpCharset.newDecoder();
  331. cache.setConverter(javaCodeSetName, result);
  332. }
  333. } catch(IllegalCharsetNameException icne) {
  334. // This can only happen if one of our charset entries has
  335. // an illegal name.
  336. throw wrapper.invalidBtcConverterName( icne, javaCodeSetName ) ;
  337. }
  338. return result;
  339. }
  340. }
  341. /**
  342. * Special converter for UTF16 since it's required to optionally
  343. * support a byte order marker while the internal Java converters
  344. * either require it or require that it isn't there.
  345. *
  346. * The solution is to check for the byte order marker, and if we
  347. * need to do something differently, switch internal converters.
  348. */
  349. private class UTF16BTCConverter extends JavaBTCConverter
  350. {
  351. private boolean defaultToLittleEndian;
  352. private boolean converterUsesBOM = true;
  353. private static final char UTF16_BE_MARKER = (char) 0xfeff;
  354. private static final char UTF16_LE_MARKER = (char) 0xfffe;
  355. // When there isn't a byte order marker, used the byte
  356. // order specified.
  357. public UTF16BTCConverter(boolean defaultToLittleEndian) {
  358. super(OSFCodeSetRegistry.UTF_16);
  359. this.defaultToLittleEndian = defaultToLittleEndian;
  360. }
  361. public char[] getChars(byte[] bytes, int offset, int numBytes) {
  362. if (hasUTF16ByteOrderMarker(bytes, offset, numBytes)) {
  363. if (!converterUsesBOM)
  364. switchToConverter(OSFCodeSetRegistry.UTF_16);
  365. converterUsesBOM = true;
  366. return super.getChars(bytes, offset, numBytes);
  367. } else {
  368. if (converterUsesBOM) {
  369. if (defaultToLittleEndian)
  370. switchToConverter(OSFCodeSetRegistry.UTF_16LE);
  371. else
  372. switchToConverter(OSFCodeSetRegistry.UTF_16BE);
  373. converterUsesBOM = false;
  374. }
  375. return super.getChars(bytes, offset, numBytes);
  376. }
  377. }
  378. /**
  379. * Utility method for determining if a UTF-16 byte order marker is present.
  380. */
  381. private boolean hasUTF16ByteOrderMarker(byte[] array, int offset, int length) {
  382. // If there aren't enough bytes to represent the marker and data,
  383. // return false.
  384. if (length >= 4) {
  385. int b1 = array[offset] & 0x00FF;
  386. int b2 = array[offset + 1] & 0x00FF;
  387. char marker = (char)((b1 << 8) | (b2 << 0));
  388. return (marker == UTF16_BE_MARKER || marker == UTF16_LE_MARKER);
  389. } else
  390. return false;
  391. }
  392. /**
  393. * The current solution for dealing with UTF-16 in CORBA
  394. * is that if our sun.io converter requires byte order markers,
  395. * and then we see a CORBA wstring/wchar without them, we
  396. * switch to the sun.io converter that doesn't require them.
  397. */
  398. private void switchToConverter(OSFCodeSetRegistry.Entry newCodeSet) {
  399. // Use the getConverter method from our superclass.
  400. btc = super.getConverter(newCodeSet.getName());
  401. }
  402. }
  403. /**
  404. * CTB converter factory for single byte or variable length encodings.
  405. */
  406. public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset) {
  407. int alignment = (!codeset.isFixedWidth() ?
  408. 1 :
  409. codeset.getMaxBytesPerChar());
  410. return new JavaCTBConverter(codeset, alignment);
  411. }
  412. /**
  413. * CTB converter factory for multibyte (mainly fixed) encodings.
  414. *
  415. * Because of the awkwardness with byte order markers and the possibility of
  416. * using UCS-2, you must specify both the endianness of the stream as well as
  417. * whether or not to use byte order markers if applicable. UCS-2 has no byte
  418. * order markers. UTF-16 has optional markers.
  419. *
  420. * If you select useByteOrderMarkers, there is no guarantee that the encoding
  421. * will use the endianness specified.
  422. *
  423. */
  424. public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset,
  425. boolean littleEndian,
  426. boolean useByteOrderMarkers) {
  427. // UCS2 doesn't have byte order markers, and we're encoding it
  428. // as UTF-16 since UCS2 isn't available in all Java platforms.
  429. // They should be identical with only minor differences in
  430. // negative cases.
  431. if (codeset == OSFCodeSetRegistry.UCS_2)
  432. return new UTF16CTBConverter(littleEndian);
  433. // We can write UTF-16 with or without a byte order marker.
  434. if (codeset == OSFCodeSetRegistry.UTF_16) {
  435. if (useByteOrderMarkers)
  436. return new UTF16CTBConverter();
  437. else
  438. return new UTF16CTBConverter(littleEndian);
  439. }
  440. // Everything else uses the generic JavaCTBConverter.
  441. //
  442. // Variable width encodings are aligned on 1 byte boundaries.
  443. // A fixed width encoding with a max. of 4 bytes/char should
  444. // align on a 4 byte boundary. Note that UTF-16 is a special
  445. // case because of the optional byte order marker, so it's
  446. // handled above.
  447. //
  448. // This doesn't matter for GIOP 1.2 wchars and wstrings
  449. // since the encoded bytes are treated as an encapsulation.
  450. int alignment = (!codeset.isFixedWidth() ?
  451. 1 :
  452. codeset.getMaxBytesPerChar());
  453. return new JavaCTBConverter(codeset, alignment);
  454. }
  455. /**
  456. * BTCConverter factory for single byte or variable width encodings.
  457. */
  458. public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset) {
  459. return new JavaBTCConverter(codeset);
  460. }
  461. /**
  462. * BTCConverter factory for fixed width multibyte encodings.
  463. */
  464. public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset,
  465. boolean defaultToLittleEndian) {
  466. if (codeset == OSFCodeSetRegistry.UTF_16 ||
  467. codeset == OSFCodeSetRegistry.UCS_2) {
  468. return new UTF16BTCConverter(defaultToLittleEndian);
  469. } else {
  470. return new JavaBTCConverter(codeset);
  471. }
  472. }
  473. /**
  474. * Follows the code set negotiation algorithm in CORBA formal 99-10-07 13.7.2.
  475. *
  476. * Returns the proper negotiated OSF character encoding number or
  477. * CodeSetConversion.FALLBACK_CODESET.
  478. */
  479. private int selectEncoding(CodeSetComponentInfo.CodeSetComponent client,
  480. CodeSetComponentInfo.CodeSetComponent server) {
  481. // A "null" value for the server's nativeCodeSet means that
  482. // the server desired not to indicate one. We'll take that
  483. // to mean that it wants the first thing in its conversion list.
  484. // If it's conversion list is empty, too, then use the fallback
  485. // codeset.
  486. int serverNative = server.nativeCodeSet;
  487. if (serverNative == 0) {
  488. if (server.conversionCodeSets.length > 0)
  489. serverNative = server.conversionCodeSets[0];
  490. else
  491. return CodeSetConversion.FALLBACK_CODESET;
  492. }
  493. if (client.nativeCodeSet == serverNative) {
  494. // Best case -- client and server don't have to convert
  495. return serverNative;
  496. }
  497. // Is this client capable of converting to the server's
  498. // native code set?
  499. for (int i = 0; i < client.conversionCodeSets.length; i++) {
  500. if (serverNative == client.conversionCodeSets[i]) {
  501. // The client will convert to the server's
  502. // native code set.
  503. return serverNative;
  504. }
  505. }
  506. // Is the server capable of converting to the client's
  507. // native code set?
  508. for (int i = 0; i < server.conversionCodeSets.length; i++) {
  509. if (client.nativeCodeSet == server.conversionCodeSets[i]) {
  510. // The server will convert to the client's
  511. // native code set.
  512. return client.nativeCodeSet;
  513. }
  514. }
  515. // See if there are any code sets that both the server and client
  516. // support (giving preference to the server). The order
  517. // of conversion sets is from most to least desired.
  518. for (int i = 0; i < server.conversionCodeSets.length; i++) {
  519. for (int y = 0; y < client.conversionCodeSets.length; y++) {
  520. if (server.conversionCodeSets[i] == client.conversionCodeSets[y]) {
  521. return server.conversionCodeSets[i];
  522. }
  523. }
  524. }
  525. // Before using the fallback codesets, the spec calls for a
  526. // compatibility check on the native code sets. It doesn't make
  527. // sense because loss free communication is always possible with
  528. // UTF8 and UTF16, the fall back code sets. It's also a lot
  529. // of work to implement. In the case of incompatibility, the
  530. // spec says to throw a CODESET_INCOMPATIBLE exception.
  531. // Use the fallback
  532. return CodeSetConversion.FALLBACK_CODESET;
  533. }
  534. /**
  535. * Perform the code set negotiation algorithm and come up with
  536. * the two encodings to use.
  537. */
  538. public CodeSetComponentInfo.CodeSetContext negotiate(CodeSetComponentInfo client,
  539. CodeSetComponentInfo server) {
  540. int charData
  541. = selectEncoding(client.getCharComponent(),
  542. server.getCharComponent());
  543. if (charData == CodeSetConversion.FALLBACK_CODESET) {
  544. charData = OSFCodeSetRegistry.UTF_8.getNumber();
  545. }
  546. int wcharData
  547. = selectEncoding(client.getWCharComponent(),
  548. server.getWCharComponent());
  549. if (wcharData == CodeSetConversion.FALLBACK_CODESET) {
  550. wcharData = OSFCodeSetRegistry.UTF_16.getNumber();
  551. }
  552. return new CodeSetComponentInfo.CodeSetContext(charData,
  553. wcharData);
  554. }
  /**
   * Private so that nothing but the singleton holder in this class
   * can instantiate a CodeSetConversion.
   */
  private CodeSetConversion() {
  }
  558. // initialize-on-demand holder
  559. private static class CodeSetConversionHolder {
  560. static final CodeSetConversion csc = new CodeSetConversion() ;
  561. }
  562. /**
  563. * CodeSetConversion is a singleton, and this is the access point.
  564. */
  565. public final static CodeSetConversion impl() {
  566. return CodeSetConversionHolder.csc ;
  567. }
  568. // Singleton instance
  569. private static CodeSetConversion implementation;
  570. // Number used internally to indicate the fallback code
  571. // set.
  572. private static final int FALLBACK_CODESET = 0;
  573. // Provides a thread local cache for the sun.io
  574. // converters.
  575. private CodeSetCache cache = new CodeSetCache();
  576. }