1. /*
  2. * @(#)CodeSetConversion.java 1.16 03/01/23
  3. *
  4. * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package com.sun.corba.se.internal.core;
  8. import java.util.Map;
  9. import java.util.HashMap;
  10. import java.io.UnsupportedEncodingException;
  11. import sun.io.*;
  12. import org.omg.CORBA.INTERNAL;
  13. import org.omg.CORBA.DATA_CONVERSION;
  14. import org.omg.CORBA.CompletionStatus;
  15. import com.sun.corba.se.internal.orbutil.MinorCodes;
  16. /**
  17. * Collection of classes, interfaces, and factory methods for
  18. * CORBA code set conversion.
  19. *
  20. * This is mainly used to shield other code from the sun.io
  21. * converters which might change, as well as provide some basic
  22. * translation from conversion to CORBA error exceptions. Some
  23. * extra work is required here to facilitate the way CORBA
  24. * says it uses UTF-16 as of the 00-11-03 spec.
  25. */
  26. public class CodeSetConversion
  27. {
  28. /**
  29. * Abstraction for char to byte conversion.
  30. *
  31. * Must be used in the proper sequence:
  32. *
  33. * 1) convert
  34. * 2) Optional getNumBytes and/or getAlignment (if necessary)
  35. * 3) getBytes (see warning)
  36. */
  37. public abstract static class CTBConverter
  38. {
  39. // Perform the conversion of the provided char or String,
  40. // allowing the caller to query for more information
  41. // before writing.
  42. public abstract void convert(char chToConvert);
  43. public abstract void convert(String strToConvert);
  44. // How many bytes resulted from the conversion?
  45. public abstract int getNumBytes();
  46. // What's the maximum number of bytes per character?
  47. public abstract int getMaxBytesPerChar();
  48. public abstract boolean isFixedWidthEncoding();
  49. // What byte boundary should the stream align to before
  50. // calling writeBytes? For instance, a fixed width
  51. // encoding with 2 bytes per char in a stream which
  52. // doesn't encapsulate the char's bytes should align
  53. // on a 2 byte boundary. (Ex: UTF16 in GIOP1.1)
  54. //
  55. // Note: This has no effect on the converted bytes. It
  56. // is just information available to the caller.
  57. public abstract int getAlignment();
  58. // Get the resulting bytes. Warning: You must use getNumBytes()
  59. // to determine the end of the data in the byte array instead
  60. // of array.length! The array may be used internally, so don't
  61. // save references.
  62. public abstract byte[] getBytes();
  63. }
  64. /**
  65. * Abstraction for byte to char conversion.
  66. */
  67. public abstract static class BTCConverter
  68. {
  69. // In GIOP 1.1, interoperability can only be achieved with
  70. // fixed width encodings like UTF-16. This is because wstrings
  71. // specified how many code points follow rather than specifying
  72. // the length in octets.
  73. public abstract boolean isFixedWidthEncoding();
  74. public abstract int getFixedCharWidth();
  75. // Called after getChars to determine the true size of the
  76. // converted array.
  77. public abstract int getNumChars();
  78. // Perform the conversion using length bytes from the given
  79. // input stream. Warning: You must use getNumChars() to
  80. // determine the correct length of the resulting array.
  81. // The same array may be used internally over multiple
  82. // calls.
  83. public abstract char[] getChars(byte[] bytes, int offset, int length);
  84. }
  85. /**
  86. * Implementation of CTBConverter which uses a sun.io.CharToByteConverter
  87. * to do the real work. Handles translation of exceptions to the
  88. * appropriate CORBA versions.
  89. */
  90. private class JavaCTBConverter extends CTBConverter
  91. {
  92. // sun.io.CharToByteConverter which actually does the work. We
  93. // have to use it directly rather than through String's interface
  94. // because we want to know when errors occur during the conversion.
  95. // It also allows us to do one less allocation and copy.
  96. private CharToByteConverter ctb;
  97. // Proper alignment for this type of converter. For instance,
  98. // ASCII has alignment of 1 (1 byte per char) but UTF16 has
  99. // alignment of 2 (2 bytes per char)
  100. private int alignment;
  101. // Char buffer to hold the input. Maintained across multiple
  102. // conversions to save a memory allocation (maybe a non-issue)
  103. // and make single char conversion faster.
  104. private char[] chars = null;
  105. // How many bytes are generated from the conversion?
  106. private int numBytes = 0;
  107. // How many characters were converted (temporary variable
  108. // for cross method communication)
  109. private int numChars = 0;
  110. // Byte buffer holding the converted input. This is necessary
  111. // since we have to do calculations that require the conversion
  112. // before writing the array to the stream. It's reused
  113. // across multiple conversions, so don't do another conversion
  114. // until you've disposed of your reference!
  115. private byte[] buffer;
  116. // What code set are we using?
  117. private OSFCodeSetRegistry.Entry codeset;
  118. public JavaCTBConverter(OSFCodeSetRegistry.Entry codeset,
  119. int alignmentForEncoding) {
  120. try {
  121. ctb = cache.getCharToByteConverter(codeset.getName());
  122. if (ctb == null) {
  123. ctb = CharToByteConverter.getConverter(codeset.getName());
  124. cache.setConverter(codeset.getName(), ctb);
  125. // When substitution mode is false, the converter
  126. // will throw exceptions when it encounters illegal
  127. // encodings.
  128. ctb.setSubstitutionMode(false);
  129. }
  130. } catch(UnsupportedEncodingException uee) {
  131. // This can only happen if one of our Entries has
  132. // an invalid name.
  133. throw new INTERNAL("Invalid converter name: " + codeset.getName());
  134. }
  135. this.codeset = codeset;
  136. alignment = alignmentForEncoding;
  137. }
  138. public final int getMaxBytesPerChar() {
  139. return ctb.getMaxBytesPerChar();
  140. }
  141. public void convert(char chToConvert) {
  142. if (chars == null)
  143. chars = new char[1];
  144. // The CharToByteConverter only takes a char[]
  145. chars[0] = chToConvert;
  146. numChars = 1;
  147. convertCharArray();
  148. }
  149. public void convert(String strToConvert) {
  150. // Try to save a memory allocation if possible. Usual
  151. // space/time trade off. If we could get the char[] out of
  152. // the String without copying, that would be great, but
  153. // it's forbidden since String is immutable.
  154. if (chars == null || chars.length < strToConvert.length())
  155. chars = new char[strToConvert.length()];
  156. numChars = strToConvert.length();
  157. strToConvert.getChars(0, numChars, chars, 0);
  158. convertCharArray();
  159. }
  160. public final int getNumBytes() {
  161. return numBytes;
  162. }
  163. public final int getAlignment() {
  164. return alignment;
  165. }
  166. public final boolean isFixedWidthEncoding() {
  167. return codeset.isFixedWidth();
  168. }
  169. public byte[] getBytes() {
  170. // Note that you can't use buffer.length since the byte array might
  171. // be larger than the actual number of converted bytes depending
  172. // on the encoding.
  173. //
  174. // Warning 2: The byte array is reused across multiple calls to the
  175. // converter!
  176. return buffer;
  177. }
  178. private void convertCharArray() {
  179. try {
  180. // Possible optimization of directly converting into the CDR buffer.
  181. // However, that means the CDR code would have to reserve
  182. // a 4 byte string length ahead of time, and we'd need a
  183. // confusing partial conversion scheme for when we couldn't
  184. // fit everything in the buffer but needed to know the
  185. // converted length before proceeding due to fragmentation.
  186. // Then there's the issue of the chunking code.
  187. //
  188. // For right now, this is less messy and basic tests don't
  189. // show more than a 1 ms penalty worst case. Less than a
  190. // factor of 2 increase.
  191. if (buffer == null || buffer.length < numChars * ctb.getMaxBytesPerChar())
  192. buffer = new byte[numChars * ctb.getMaxBytesPerChar()];
  193. // Return the converter to its initial state
  194. ctb.reset();
  195. // Convert the characters
  196. numBytes = ctb.convert(chars, 0, numChars,
  197. buffer, 0, buffer.length);
  198. // Converters must be flushed to finish up
  199. numBytes += ctb.flush(buffer, 0, buffer.length);
  200. } catch (MalformedInputException mie) {
  201. // There were illegal Unicode char pairs
  202. throw new DATA_CONVERSION(mie.getMessage(),
  203. MinorCodes.BAD_UNICODE_PAIR,
  204. CompletionStatus.COMPLETED_NO);
  205. } catch (UnknownCharacterException uce) {
  206. // A character doesn't map to the desired code set
  207. // CORBA formal 00-11-03.
  208. throw new DATA_CONVERSION(uce.getMessage(),
  209. MinorCodes.CHAR_NOT_IN_CODESET,
  210. CompletionStatus.COMPLETED_NO);
  211. } catch (ConversionBufferFullException cbfe) {
  212. // If this happens, then the CharToByteConverter was lying
  213. // about the maximum bytes per char.
  214. throw new INTERNAL(cbfe.getMessage(),
  215. MinorCodes.CTB_CONVERTER_FAILURE,
  216. CompletionStatus.COMPLETED_NO);
  217. }
  218. }
  219. }
  220. /**
  221. * Special UTF16 converter which can either always write a BOM
  222. * or use a specified byte order without one.
  223. */
  224. private class UTF16CTBConverter extends JavaCTBConverter
  225. {
  226. // Using this constructor, we will always write a BOM
  227. public UTF16CTBConverter() {
  228. super(OSFCodeSetRegistry.UTF_16, 2);
  229. }
  230. // Using this constructor, we don't use a BOM and use the
  231. // byte order specified
  232. public UTF16CTBConverter(boolean littleEndian) {
  233. super(littleEndian ?
  234. OSFCodeSetRegistry.UTF_16LE :
  235. OSFCodeSetRegistry.UTF_16BE,
  236. 2);
  237. }
  238. }
  239. /**
  240. * Implementation of BTCConverter which uses a sun.io.ByteToCharConverter
  241. * for the real work. Handles translation of exceptions to the
  242. * appropriate CORBA versions.
  243. */
  244. private class JavaBTCConverter extends BTCConverter
  245. {
  246. protected ByteToCharConverter btc;
  247. private char[] buffer;
  248. private int resultingNumChars;
  249. private OSFCodeSetRegistry.Entry codeset;
  250. public JavaBTCConverter(OSFCodeSetRegistry.Entry codeset) {
  251. // Obtain a ByteToCharConverter
  252. btc = getConverter(codeset.getName());
  253. this.codeset = codeset;
  254. }
  255. public final boolean isFixedWidthEncoding() {
  256. return codeset.isFixedWidth();
  257. }
  258. // Should only be called if isFixedWidthEncoding is true
  259. public final int getFixedCharWidth() {
  260. return codeset.getMaxBytesPerChar();
  261. }
  262. public final int getNumChars() {
  263. return resultingNumChars;
  264. }
  265. public char[] getChars(byte[] bytes, int offset, int numBytes) {
  266. // Possible optimization of reading directly from the CDR
  267. // byte buffer. The sun.io converter supposedly can handle
  268. // incremental conversions in which a char is broken across
  269. // two convert calls.
  270. //
  271. // Basic tests didn't show more than a 1 ms increase
  272. // worst case. It's less than a factor of 2 increase.
  273. // Also makes the interface more difficult.
  274. if (buffer == null || buffer.length < numBytes * btc.getMaxCharsPerByte())
  275. buffer = new char[numBytes * btc.getMaxCharsPerByte()];
  276. try {
  277. btc.reset();
  278. // WARNING: The signature for convert is
  279. // bytes[], offset, endPosition
  280. // not
  281. // bytes[], offset, total length
  282. resultingNumChars = btc.convert(bytes, offset, offset + numBytes,
  283. buffer, 0, buffer.length);
  284. resultingNumChars += btc.flush(buffer, 0, buffer.length);
  285. return buffer;
  286. } catch (MalformedInputException mie) {
  287. // There were illegal Unicode char pairs
  288. throw new DATA_CONVERSION(mie.getMessage(),
  289. MinorCodes.BAD_UNICODE_PAIR,
  290. CompletionStatus.COMPLETED_NO);
  291. } catch (UnknownCharacterException uce) {
  292. // A character doesn't map to the desired code set.
  293. // CORBA formal 00-11-03.
  294. throw new DATA_CONVERSION(uce.getMessage(),
  295. MinorCodes.CHAR_NOT_IN_CODESET,
  296. CompletionStatus.COMPLETED_NO);
  297. } catch (ConversionBufferFullException cbfe) {
  298. // If this happens, then the CharToByteConverter was lying
  299. // about the maximum chars per byte.
  300. throw new INTERNAL(cbfe.getMessage(),
  301. MinorCodes.BTC_CONVERTER_FAILURE,
  302. CompletionStatus.COMPLETED_NO);
  303. }
  304. }
  305. /**
  306. * Utility method to find a ByteToCharConverter in the
  307. * cache or create a new one if necessary. Throws an
  308. * INTERNAL if the code set is unknown.
  309. */
  310. protected ByteToCharConverter getConverter(String javaCodeSetName) {
  311. try {
  312. ByteToCharConverter result
  313. = cache.getByteToCharConverter(javaCodeSetName);
  314. if (result == null) {
  315. result = ByteToCharConverter.getConverter(javaCodeSetName);
  316. cache.setConverter(javaCodeSetName, result);
  317. // When substitution mode is false, the converter
  318. // will throw exceptions when it encounters illegal
  319. // encodings.
  320. result.setSubstitutionMode(false);
  321. }
  322. return result;
  323. } catch(UnsupportedEncodingException uee) {
  324. // This can only happen if one of our entries has
  325. // an invalid name.
  326. throw new INTERNAL("Invalid converter name: " + javaCodeSetName);
  327. }
  328. }
  329. }
  330. /**
  331. * Special converter for UTF16 since it's required to optionally
  332. * support a byte order marker while the internal Java converters
  333. * either require it or require that it isn't there.
  334. *
  335. * The solution is to check for the byte order marker, and if we
  336. * need to do something differently, switch internal converters.
  337. */
  338. private class UTF16BTCConverter extends JavaBTCConverter
  339. {
  340. private boolean defaultToLittleEndian;
  341. private boolean converterUsesBOM = true;
  342. private static final char UTF16_BE_MARKER = (char) 0xfeff;
  343. private static final char UTF16_LE_MARKER = (char) 0xfffe;
  344. // When there isn't a byte order marker, used the byte
  345. // order specified.
  346. public UTF16BTCConverter(boolean defaultToLittleEndian) {
  347. super(OSFCodeSetRegistry.UTF_16);
  348. this.defaultToLittleEndian = defaultToLittleEndian;
  349. }
  350. public char[] getChars(byte[] bytes, int offset, int numBytes) {
  351. if (hasUTF16ByteOrderMarker(bytes, offset, numBytes)) {
  352. if (!converterUsesBOM)
  353. switchToConverter(OSFCodeSetRegistry.UTF_16);
  354. converterUsesBOM = true;
  355. return super.getChars(bytes, offset, numBytes);
  356. } else {
  357. if (converterUsesBOM) {
  358. if (defaultToLittleEndian)
  359. switchToConverter(OSFCodeSetRegistry.UTF_16LE);
  360. else
  361. switchToConverter(OSFCodeSetRegistry.UTF_16BE);
  362. converterUsesBOM = false;
  363. }
  364. return super.getChars(bytes, offset, numBytes);
  365. }
  366. }
  367. /**
  368. * Utility method for determining if a UTF-16 byte order marker is present.
  369. */
  370. private boolean hasUTF16ByteOrderMarker(byte[] array, int offset, int length) {
  371. // If there aren't enough bytes to represent the marker and data,
  372. // return false.
  373. if (length >= 4) {
  374. int b1 = array[offset] & 0x00FF;
  375. int b2 = array[offset + 1] & 0x00FF;
  376. char marker = (char)((b1 << 8) | (b2 << 0));
  377. return (marker == UTF16_BE_MARKER || marker == UTF16_LE_MARKER);
  378. } else
  379. return false;
  380. }
  381. /**
  382. * The current solution for dealing with UTF-16 in CORBA
  383. * is that if our sun.io converter requires byte order markers,
  384. * and then we see a CORBA wstring/wchar without them, we
  385. * switch to the sun.io converter that doesn't require them.
  386. */
  387. private void switchToConverter(OSFCodeSetRegistry.Entry newCodeSet) {
  388. // Use the getConverter method from our superclass.
  389. btc = getConverter(newCodeSet.getName());
  390. }
  391. }
  392. /**
  393. * CTB converter factory for single byte or variable length encodings.
  394. */
  395. public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset) {
  396. int alignment = (!codeset.isFixedWidth() ?
  397. 1 :
  398. codeset.getMaxBytesPerChar());
  399. return new JavaCTBConverter(codeset, alignment);
  400. }
  401. /**
  402. * CTB converter factory for multibyte (mainly fixed) encodings.
  403. *
  404. * Because of the awkwardness with byte order markers and the possibility of
  405. * using UCS-2, you must specify both the endianness of the stream as well as
  406. * whether or not to use byte order markers if applicable. UCS-2 has no byte
  407. * order markers. UTF-16 has optional markers.
  408. *
  409. * If you select useByteOrderMarkers, there is no guarantee that the encoding
  410. * will use the endianness specified.
  411. *
  412. */
  413. public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset,
  414. boolean littleEndian,
  415. boolean useByteOrderMarkers) {
  416. // UCS2 doesn't have byte order markers, and we're encoding it
  417. // as UTF-16 since UCS2 isn't available in all Java platforms.
  418. // They should be identical with only minor differences in
  419. // negative cases.
  420. if (codeset == OSFCodeSetRegistry.UCS_2)
  421. return new UTF16CTBConverter(littleEndian);
  422. // We can write UTF-16 with or without a byte order marker.
  423. if (codeset == OSFCodeSetRegistry.UTF_16) {
  424. if (useByteOrderMarkers)
  425. return new UTF16CTBConverter();
  426. else
  427. return new UTF16CTBConverter(littleEndian);
  428. }
  429. // Everything else uses the generic JavaCTBConverter.
  430. //
  431. // Variable width encodings are aligned on 1 byte boundaries.
  432. // A fixed width encoding with a max. of 4 bytes/char should
  433. // align on a 4 byte boundary. Note that UTF-16 is a special
  434. // case because of the optional byte order marker, so it's
  435. // handled above.
  436. //
  437. // This doesn't matter for GIOP 1.2 wchars and wstrings
  438. // since the encoded bytes are treated as an encapsulation.
  439. int alignment = (!codeset.isFixedWidth() ?
  440. 1 :
  441. codeset.getMaxBytesPerChar());
  442. return new JavaCTBConverter(codeset, alignment);
  443. }
  444. /**
  445. * BTCConverter factory for single byte or variable width encodings.
  446. */
  447. public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset) {
  448. return new JavaBTCConverter(codeset);
  449. }
  450. /**
  451. * BTCConverter factory for fixed width multibyte encodings.
  452. */
  453. public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset,
  454. boolean defaultToLittleEndian) {
  455. if (codeset == OSFCodeSetRegistry.UTF_16 ||
  456. codeset == OSFCodeSetRegistry.UCS_2) {
  457. return new UTF16BTCConverter(defaultToLittleEndian);
  458. } else {
  459. return new JavaBTCConverter(codeset);
  460. }
  461. }
  462. /**
  463. * Follows the code set negotiation algorithm in CORBA formal 99-10-07 13.7.2.
  464. *
  465. * Returns the proper negotiated OSF character encoding number or
  466. * CodeSetConversion.FALLBACK_CODESET.
  467. */
  468. private int selectEncoding(CodeSetComponentInfo.CodeSetComponent client,
  469. CodeSetComponentInfo.CodeSetComponent server) {
  470. // A "null" value for the server's nativeCodeSet means that
  471. // the server desired not to indicate one. We'll take that
  472. // to mean that it wants the first thing in its conversion list.
  473. // If it's conversion list is empty, too, then use the fallback
  474. // codeset.
  475. int serverNative = server.nativeCodeSet;
  476. if (serverNative == 0) {
  477. if (server.conversionCodeSets.length > 0)
  478. serverNative = server.conversionCodeSets[0];
  479. else
  480. return CodeSetConversion.FALLBACK_CODESET;
  481. }
  482. if (client.nativeCodeSet == serverNative) {
  483. // Best case -- client and server don't have to convert
  484. return serverNative;
  485. }
  486. // Is this client capable of converting to the server's
  487. // native code set?
  488. for (int i = 0; i < client.conversionCodeSets.length; i++) {
  489. if (serverNative == client.conversionCodeSets[i]) {
  490. // The client will convert to the server's
  491. // native code set.
  492. return serverNative;
  493. }
  494. }
  495. // Is the server capable of converting to the client's
  496. // native code set?
  497. for (int i = 0; i < server.conversionCodeSets.length; i++) {
  498. if (client.nativeCodeSet == server.conversionCodeSets[i]) {
  499. // The server will convert to the client's
  500. // native code set.
  501. return client.nativeCodeSet;
  502. }
  503. }
  504. // See if there are any code sets that both the server and client
  505. // support (giving preference to the server). The order
  506. // of conversion sets is from most to least desired.
  507. for (int i = 0; i < server.conversionCodeSets.length; i++) {
  508. for (int y = 0; y < client.conversionCodeSets.length; y++) {
  509. if (server.conversionCodeSets[i] == client.conversionCodeSets[y]) {
  510. return server.conversionCodeSets[i];
  511. }
  512. }
  513. }
  514. // Before using the fallback codesets, the spec calls for a
  515. // compatibility check on the native code sets. It doesn't make
  516. // sense because loss free communication is always possible with
  517. // UTF8 and UTF16, the fall back code sets. It's also a lot
  518. // of work to implement. In the case of incompatibility, the
  519. // spec says to throw a CODESET_INCOMPATIBLE exception.
  520. // Use the fallback
  521. return CodeSetConversion.FALLBACK_CODESET;
  522. }
  523. /**
  524. * Perform the code set negotiation algorithm and come up with
  525. * the two encodings to use.
  526. */
  527. public CodeSetComponentInfo.CodeSetContext negotiate(CodeSetComponentInfo client,
  528. CodeSetComponentInfo server) {
  529. int charData
  530. = selectEncoding(client.getCharComponent(),
  531. server.getCharComponent());
  532. if (charData == CodeSetConversion.FALLBACK_CODESET) {
  533. charData = OSFCodeSetRegistry.UTF_8.getNumber();
  534. }
  535. int wcharData
  536. = selectEncoding(client.getWCharComponent(),
  537. server.getWCharComponent());
  538. if (wcharData == CodeSetConversion.FALLBACK_CODESET) {
  539. wcharData = OSFCodeSetRegistry.UTF_16.getNumber();
  540. }
  541. return new CodeSetComponentInfo.CodeSetContext(charData,
  542. wcharData);
  543. }
  /**
   * Private so that only the singleton holder below can create a
   * CodeSetConversion.
   */
  private CodeSetConversion() {
  }

  /**
   * Initialize-on-demand holder for the singleton; the instance is
   * created when this class is initialized.
   */
  private static class CodeSetConversionHolder {
    static final CodeSetConversion INSTANCE = new CodeSetConversion();
  }

  /**
   * CodeSetConversion is a singleton, and this is the access point.
   */
  public static final CodeSetConversion impl() {
    return CodeSetConversionHolder.INSTANCE;
  }
  557. // Singleton instance
  558. private static CodeSetConversion implementation;
  559. // Number used internally to indicate the fallback code
  560. // set.
  561. private static final int FALLBACK_CODESET = 0;
  562. // Provides a thread local cache for the sun.io
  563. // converters.
  564. private CodeSetCache cache = new CodeSetCache();
  565. }