- /*
- * @(#)CodeSetConversion.java 1.19 04/03/01
- *
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
- */
- package com.sun.corba.se.impl.encoding;
-
- import java.util.Map;
- import java.util.HashMap;
- import java.nio.ByteBuffer;
- import java.nio.CharBuffer;
- import java.nio.charset.Charset;
- import java.nio.charset.CharsetEncoder;
- import java.nio.charset.CharsetDecoder;
- import java.nio.charset.CharacterCodingException;
- import java.nio.charset.IllegalCharsetNameException;
- import java.nio.charset.MalformedInputException;
- import java.nio.charset.UnsupportedCharsetException;
- import java.nio.charset.UnmappableCharacterException;
- import com.sun.corba.se.impl.logging.ORBUtilSystemException;
- import com.sun.corba.se.impl.logging.OMGSystemException;
- import com.sun.corba.se.spi.logging.CORBALogDomains;
-
- /**
- * Collection of classes, interfaces, and factory methods for
- * CORBA code set conversion.
- *
- * This is mainly used to shield other code from the sun.io
- * converters which might change, as well as provide some basic
- * translation from conversion to CORBA error exceptions. Some
- * extra work is required here to facilitate the way CORBA
- * says it uses UTF-16 as of the 00-11-03 spec.
- *
- * REVISIT - Since the nio.Charset and nio.Charset.Encoder/Decoder
- * use NIO ByteBuffer and NIO CharBuffer, the interaction
- * and interface between this class and the CDR streams
- * should be looked at more closely for optimizations to
- * avoid unnecessary copying of data between char[] &
- * CharBuffer and byte[] & ByteBuffer, especially
- * DirectByteBuffers.
- *
- */
- public class CodeSetConversion
- {
- /**
- * Abstraction for char to byte conversion.
- *
- * Must be used in the proper sequence:
- *
- * 1) convert
- * 2) Optional getNumBytes and/or getAlignment (if necessary)
- * 3) getBytes (see warning)
- */
- public abstract static class CTBConverter
- {
- // Perform the conversion of the provided char or String,
- // allowing the caller to query for more information
- // before writing.
- public abstract void convert(char chToConvert);
- public abstract void convert(String strToConvert);
-
- // How many bytes resulted from the conversion?
- public abstract int getNumBytes();
-
- // What's the maximum number of bytes per character?
- public abstract float getMaxBytesPerChar();
-
- public abstract boolean isFixedWidthEncoding();
-
- // What byte boundary should the stream align to before
- // calling writeBytes? For instance, a fixed width
- // encoding with 2 bytes per char in a stream which
- // doesn't encapsulate the char's bytes should align
- // on a 2 byte boundary. (Ex: UTF16 in GIOP1.1)
- //
- // Note: This has no effect on the converted bytes. It
- // is just information available to the caller.
- public abstract int getAlignment();
-
- // Get the resulting bytes. Warning: You must use getNumBytes()
- // to determine the end of the data in the byte array instead
- // of array.length! The array may be used internally, so don't
- // save references.
- public abstract byte[] getBytes();
- }
-
- /**
- * Abstraction for byte to char conversion.
- */
- public abstract static class BTCConverter
- {
- // In GIOP 1.1, interoperability can only be achieved with
- // fixed width encodings like UTF-16. This is because wstrings
- // specified how many code points follow rather than specifying
- // the length in octets.
- public abstract boolean isFixedWidthEncoding();
- public abstract int getFixedCharWidth();
-
- // Called after getChars to determine the true size of the
- // converted array.
- public abstract int getNumChars();
-
- // Perform the conversion using length bytes from the given
- // input stream. Warning: You must use getNumChars() to
- // determine the correct length of the resulting array.
- // The same array may be used internally over multiple
- // calls.
- public abstract char[] getChars(byte[] bytes, int offset, int length);
- }
-
- /**
- * Implementation of CTBConverter which uses a nio.Charset.CharsetEncoder
- * to do the real work. Handles translation of exceptions to the
- * appropriate CORBA versions.
- */
- private class JavaCTBConverter extends CTBConverter
- {
- private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
- CORBALogDomains.RPC_ENCODING ) ;
-
- private OMGSystemException omgWrapper = OMGSystemException.get(
- CORBALogDomains.RPC_ENCODING ) ;
-
- // nio.Charset.CharsetEncoder actually does the work here
- // have to use it directly rather than through String's interface
- // because we want to know when errors occur during the conversion.
- private CharsetEncoder ctb;
-
- // Proper alignment for this type of converter. For instance,
- // ASCII has alignment of 1 (1 byte per char) but UTF16 has
- // alignment of 2 (2 bytes per char)
- private int alignment;
-
- // Char buffer to hold the input.
- private char[] chars = null;
-
- // How many bytes are generated from the conversion?
- private int numBytes = 0;
-
- // How many characters were converted (temporary variable
- // for cross method communication)
- private int numChars = 0;
-
- // ByteBuffer holding the converted input. This is necessary
- // since we have to do calculations that require the conversion
- // before writing the array to the stream.
- private ByteBuffer buffer;
-
- // What code set are we using?
- private OSFCodeSetRegistry.Entry codeset;
-
- public JavaCTBConverter(OSFCodeSetRegistry.Entry codeset,
- int alignmentForEncoding) {
-
- try {
- ctb = cache.getCharToByteConverter(codeset.getName());
- if (ctb == null) {
- Charset tmpCharset = Charset.forName(codeset.getName());
- ctb = tmpCharset.newEncoder();
- cache.setConverter(codeset.getName(), ctb);
- }
- } catch(IllegalCharsetNameException icne) {
-
- // This can only happen if one of our Entries has
- // an invalid name.
- throw wrapper.invalidCtbConverterName(icne,codeset.getName());
- } catch(UnsupportedCharsetException ucne) {
-
- // This can only happen if one of our Entries has
- // an unsupported name.
- throw wrapper.invalidCtbConverterName(ucne,codeset.getName());
- }
-
- this.codeset = codeset;
- alignment = alignmentForEncoding;
- }
-
- public final float getMaxBytesPerChar() {
- return ctb.maxBytesPerChar();
- }
-
- public void convert(char chToConvert) {
- if (chars == null)
- chars = new char[1];
-
- // The CharToByteConverter only takes a char[]
- chars[0] = chToConvert;
- numChars = 1;
-
- convertCharArray();
- }
-
- public void convert(String strToConvert) {
- // Try to save a memory allocation if possible. Usual
- // space/time trade off. If we could get the char[] out of
- // the String without copying, that would be great, but
- // it's forbidden since String is immutable.
- if (chars == null || chars.length < strToConvert.length())
- chars = new char[strToConvert.length()];
-
- numChars = strToConvert.length();
-
- strToConvert.getChars(0, numChars, chars, 0);
-
- convertCharArray();
- }
-
- public final int getNumBytes() {
- return numBytes;
- }
-
- public final int getAlignment() {
- return alignment;
- }
-
- public final boolean isFixedWidthEncoding() {
- return codeset.isFixedWidth();
- }
-
- public byte[] getBytes() {
- // Note that you can't use buffer.length since the buffer might
- // be larger than the actual number of converted bytes depending
- // on the encoding.
- return buffer.array();
- }
-
- private void convertCharArray() {
- try {
-
- // Possible optimization of directly converting into the CDR buffer.
- // However, that means the CDR code would have to reserve
- // a 4 byte string length ahead of time, and we'd need a
- // confusing partial conversion scheme for when we couldn't
- // fit everything in the buffer but needed to know the
- // converted length before proceeding due to fragmentation.
- // Then there's the issue of the chunking code.
- //
- // For right now, this is less messy and basic tests don't
- // show more than a 1 ms penalty worst case. Less than a
- // factor of 2 increase.
-
- // Convert the characters
- buffer = ctb.encode(CharBuffer.wrap(chars,0,numChars));
-
- // ByteBuffer returned by the encoder will set its limit
- // to byte immediately after the last written byte.
- numBytes = buffer.limit();
-
- } catch (IllegalStateException ise) {
- // an encoding operation is already in progress
- throw wrapper.ctbConverterFailure( ise ) ;
- } catch (MalformedInputException mie) {
- // There were illegal Unicode char pairs
- throw wrapper.badUnicodePair( mie ) ;
- } catch (UnmappableCharacterException uce) {
- // A character doesn't map to the desired code set
- // CORBA formal 00-11-03.
- throw omgWrapper.charNotInCodeset( uce ) ;
- } catch (CharacterCodingException cce) {
- // If this happens, then some other encoding error occured
- throw wrapper.ctbConverterFailure( cce ) ;
- }
- }
- }
-
- /**
- * Special UTF16 converter which can either always write a BOM
- * or use a specified byte order without one.
- */
- private class UTF16CTBConverter extends JavaCTBConverter
- {
- // Using this constructor, we will always write a BOM
- public UTF16CTBConverter() {
- super(OSFCodeSetRegistry.UTF_16, 2);
- }
-
- // Using this constructor, we don't use a BOM and use the
- // byte order specified
- public UTF16CTBConverter(boolean littleEndian) {
- super(littleEndian ?
- OSFCodeSetRegistry.UTF_16LE :
- OSFCodeSetRegistry.UTF_16BE,
- 2);
- }
- }
-
- /**
- * Implementation of BTCConverter which uses a sun.io.ByteToCharConverter
- * for the real work. Handles translation of exceptions to the
- * appropriate CORBA versions.
- */
- private class JavaBTCConverter extends BTCConverter
- {
- private ORBUtilSystemException wrapper = ORBUtilSystemException.get(
- CORBALogDomains.RPC_ENCODING ) ;
-
- private OMGSystemException omgWrapper = OMGSystemException.get(
- CORBALogDomains.RPC_ENCODING ) ;
-
- protected CharsetDecoder btc;
- private char[] buffer;
- private int resultingNumChars;
- private OSFCodeSetRegistry.Entry codeset;
-
- public JavaBTCConverter(OSFCodeSetRegistry.Entry codeset) {
-
- // Obtain a Decoder
- btc = this.getConverter(codeset.getName());
-
- this.codeset = codeset;
- }
-
- public final boolean isFixedWidthEncoding() {
- return codeset.isFixedWidth();
- }
-
- // Should only be called if isFixedWidthEncoding is true
- // IMPORTANT: This calls OSFCodeSetRegistry.Entry, not
- // CharsetDecoder.maxCharsPerByte().
- public final int getFixedCharWidth() {
- return codeset.getMaxBytesPerChar();
- }
-
- public final int getNumChars() {
- return resultingNumChars;
- }
-
- public char[] getChars(byte[] bytes, int offset, int numBytes) {
-
- // Possible optimization of reading directly from the CDR
- // byte buffer. The sun.io converter supposedly can handle
- // incremental conversions in which a char is broken across
- // two convert calls.
- //
- // Basic tests didn't show more than a 1 ms increase
- // worst case. It's less than a factor of 2 increase.
- // Also makes the interface more difficult.
-
-
- try {
-
- ByteBuffer byteBuf = ByteBuffer.wrap(bytes, offset, numBytes);
- CharBuffer charBuf = btc.decode(byteBuf);
-
- // CharBuffer returned by the decoder will set its limit
- // to byte immediately after the last written byte.
- resultingNumChars = charBuf.limit();
-
- // IMPORTANT - It's possible the underlying char[] in the
- // CharBuffer returned by btc.decode(byteBuf)
- // is longer in length than the number of characters
- // decoded. Hence, the check below to ensure the
- // char[] returned contains all the chars that have
- // been decoded and no more.
- if (charBuf.limit() == charBuf.capacity()) {
- buffer = charBuf.array();
- } else {
- buffer = new char[charBuf.limit()];
- charBuf.get(buffer, 0, charBuf.limit()).position(0);
- }
-
- return buffer;
-
- } catch (IllegalStateException ile) {
- // There were a decoding operation already in progress
- throw wrapper.btcConverterFailure( ile ) ;
- } catch (MalformedInputException mie) {
- // There were illegal Unicode char pairs
- throw wrapper.badUnicodePair( mie ) ;
- } catch (UnmappableCharacterException uce) {
- // A character doesn't map to the desired code set.
- // CORBA formal 00-11-03.
- throw omgWrapper.charNotInCodeset( uce ) ;
- } catch (CharacterCodingException cce) {
- // If this happens, then a character decoding error occured.
- throw wrapper.btcConverterFailure( cce ) ;
- }
- }
-
- /**
- * Utility method to find a CharsetDecoder in the
- * cache or create a new one if necessary. Throws an
- * INTERNAL if the code set is unknown.
- */
- protected CharsetDecoder getConverter(String javaCodeSetName) {
-
- CharsetDecoder result = null;
- try {
- result = cache.getByteToCharConverter(javaCodeSetName);
-
- if (result == null) {
- Charset tmpCharset = Charset.forName(javaCodeSetName);
- result = tmpCharset.newDecoder();
- cache.setConverter(javaCodeSetName, result);
- }
-
- } catch(IllegalCharsetNameException icne) {
- // This can only happen if one of our charset entries has
- // an illegal name.
- throw wrapper.invalidBtcConverterName( icne, javaCodeSetName ) ;
- }
-
- return result;
- }
- }
-
- /**
- * Special converter for UTF16 since it's required to optionally
- * support a byte order marker while the internal Java converters
- * either require it or require that it isn't there.
- *
- * The solution is to check for the byte order marker, and if we
- * need to do something differently, switch internal converters.
- */
- private class UTF16BTCConverter extends JavaBTCConverter
- {
- private boolean defaultToLittleEndian;
- private boolean converterUsesBOM = true;
-
- private static final char UTF16_BE_MARKER = (char) 0xfeff;
- private static final char UTF16_LE_MARKER = (char) 0xfffe;
-
- // When there isn't a byte order marker, used the byte
- // order specified.
- public UTF16BTCConverter(boolean defaultToLittleEndian) {
- super(OSFCodeSetRegistry.UTF_16);
-
- this.defaultToLittleEndian = defaultToLittleEndian;
- }
-
- public char[] getChars(byte[] bytes, int offset, int numBytes) {
-
- if (hasUTF16ByteOrderMarker(bytes, offset, numBytes)) {
- if (!converterUsesBOM)
- switchToConverter(OSFCodeSetRegistry.UTF_16);
-
- converterUsesBOM = true;
-
- return super.getChars(bytes, offset, numBytes);
- } else {
- if (converterUsesBOM) {
- if (defaultToLittleEndian)
- switchToConverter(OSFCodeSetRegistry.UTF_16LE);
- else
- switchToConverter(OSFCodeSetRegistry.UTF_16BE);
-
- converterUsesBOM = false;
- }
-
- return super.getChars(bytes, offset, numBytes);
- }
- }
-
- /**
- * Utility method for determining if a UTF-16 byte order marker is present.
- */
- private boolean hasUTF16ByteOrderMarker(byte[] array, int offset, int length) {
- // If there aren't enough bytes to represent the marker and data,
- // return false.
- if (length >= 4) {
-
- int b1 = array[offset] & 0x00FF;
- int b2 = array[offset + 1] & 0x00FF;
-
- char marker = (char)((b1 << 8) | (b2 << 0));
-
- return (marker == UTF16_BE_MARKER || marker == UTF16_LE_MARKER);
- } else
- return false;
- }
-
- /**
- * The current solution for dealing with UTF-16 in CORBA
- * is that if our sun.io converter requires byte order markers,
- * and then we see a CORBA wstring/wchar without them, we
- * switch to the sun.io converter that doesn't require them.
- */
- private void switchToConverter(OSFCodeSetRegistry.Entry newCodeSet) {
-
- // Use the getConverter method from our superclass.
- btc = super.getConverter(newCodeSet.getName());
- }
- }
-
- /**
- * CTB converter factory for single byte or variable length encodings.
- */
- public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset) {
- int alignment = (!codeset.isFixedWidth() ?
- 1 :
- codeset.getMaxBytesPerChar());
-
- return new JavaCTBConverter(codeset, alignment);
- }
-
- /**
- * CTB converter factory for multibyte (mainly fixed) encodings.
- *
- * Because of the awkwardness with byte order markers and the possibility of
- * using UCS-2, you must specify both the endianness of the stream as well as
- * whether or not to use byte order markers if applicable. UCS-2 has no byte
- * order markers. UTF-16 has optional markers.
- *
- * If you select useByteOrderMarkers, there is no guarantee that the encoding
- * will use the endianness specified.
- *
- */
- public CTBConverter getCTBConverter(OSFCodeSetRegistry.Entry codeset,
- boolean littleEndian,
- boolean useByteOrderMarkers) {
-
- // UCS2 doesn't have byte order markers, and we're encoding it
- // as UTF-16 since UCS2 isn't available in all Java platforms.
- // They should be identical with only minor differences in
- // negative cases.
- if (codeset == OSFCodeSetRegistry.UCS_2)
- return new UTF16CTBConverter(littleEndian);
-
- // We can write UTF-16 with or without a byte order marker.
- if (codeset == OSFCodeSetRegistry.UTF_16) {
- if (useByteOrderMarkers)
- return new UTF16CTBConverter();
- else
- return new UTF16CTBConverter(littleEndian);
- }
-
- // Everything else uses the generic JavaCTBConverter.
- //
- // Variable width encodings are aligned on 1 byte boundaries.
- // A fixed width encoding with a max. of 4 bytes/char should
- // align on a 4 byte boundary. Note that UTF-16 is a special
- // case because of the optional byte order marker, so it's
- // handled above.
- //
- // This doesn't matter for GIOP 1.2 wchars and wstrings
- // since the encoded bytes are treated as an encapsulation.
- int alignment = (!codeset.isFixedWidth() ?
- 1 :
- codeset.getMaxBytesPerChar());
-
- return new JavaCTBConverter(codeset, alignment);
- }
-
- /**
- * BTCConverter factory for single byte or variable width encodings.
- */
- public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset) {
- return new JavaBTCConverter(codeset);
- }
-
- /**
- * BTCConverter factory for fixed width multibyte encodings.
- */
- public BTCConverter getBTCConverter(OSFCodeSetRegistry.Entry codeset,
- boolean defaultToLittleEndian) {
-
- if (codeset == OSFCodeSetRegistry.UTF_16 ||
- codeset == OSFCodeSetRegistry.UCS_2) {
-
- return new UTF16BTCConverter(defaultToLittleEndian);
- } else {
- return new JavaBTCConverter(codeset);
- }
- }
-
- /**
- * Follows the code set negotiation algorithm in CORBA formal 99-10-07 13.7.2.
- *
- * Returns the proper negotiated OSF character encoding number or
- * CodeSetConversion.FALLBACK_CODESET.
- */
- private int selectEncoding(CodeSetComponentInfo.CodeSetComponent client,
- CodeSetComponentInfo.CodeSetComponent server) {
-
- // A "null" value for the server's nativeCodeSet means that
- // the server desired not to indicate one. We'll take that
- // to mean that it wants the first thing in its conversion list.
- // If it's conversion list is empty, too, then use the fallback
- // codeset.
- int serverNative = server.nativeCodeSet;
-
- if (serverNative == 0) {
- if (server.conversionCodeSets.length > 0)
- serverNative = server.conversionCodeSets[0];
- else
- return CodeSetConversion.FALLBACK_CODESET;
- }
-
- if (client.nativeCodeSet == serverNative) {
- // Best case -- client and server don't have to convert
- return serverNative;
- }
-
- // Is this client capable of converting to the server's
- // native code set?
- for (int i = 0; i < client.conversionCodeSets.length; i++) {
- if (serverNative == client.conversionCodeSets[i]) {
- // The client will convert to the server's
- // native code set.
- return serverNative;
- }
- }
-
- // Is the server capable of converting to the client's
- // native code set?
- for (int i = 0; i < server.conversionCodeSets.length; i++) {
- if (client.nativeCodeSet == server.conversionCodeSets[i]) {
- // The server will convert to the client's
- // native code set.
- return client.nativeCodeSet;
- }
- }
-
- // See if there are any code sets that both the server and client
- // support (giving preference to the server). The order
- // of conversion sets is from most to least desired.
- for (int i = 0; i < server.conversionCodeSets.length; i++) {
- for (int y = 0; y < client.conversionCodeSets.length; y++) {
- if (server.conversionCodeSets[i] == client.conversionCodeSets[y]) {
- return server.conversionCodeSets[i];
- }
- }
- }
-
- // Before using the fallback codesets, the spec calls for a
- // compatibility check on the native code sets. It doesn't make
- // sense because loss free communication is always possible with
- // UTF8 and UTF16, the fall back code sets. It's also a lot
- // of work to implement. In the case of incompatibility, the
- // spec says to throw a CODESET_INCOMPATIBLE exception.
-
- // Use the fallback
- return CodeSetConversion.FALLBACK_CODESET;
- }
-
- /**
- * Perform the code set negotiation algorithm and come up with
- * the two encodings to use.
- */
- public CodeSetComponentInfo.CodeSetContext negotiate(CodeSetComponentInfo client,
- CodeSetComponentInfo server) {
- int charData
- = selectEncoding(client.getCharComponent(),
- server.getCharComponent());
-
- if (charData == CodeSetConversion.FALLBACK_CODESET) {
- charData = OSFCodeSetRegistry.UTF_8.getNumber();
- }
-
- int wcharData
- = selectEncoding(client.getWCharComponent(),
- server.getWCharComponent());
-
- if (wcharData == CodeSetConversion.FALLBACK_CODESET) {
- wcharData = OSFCodeSetRegistry.UTF_16.getNumber();
- }
-
- return new CodeSetComponentInfo.CodeSetContext(charData,
- wcharData);
- }
-
// No one should instantiate a CodeSetConversion but the singleton
// holder below.
private CodeSetConversion() {}

// Initialize-on-demand holder: the instance is created when this
// nested class is first initialized, i.e. on the first impl() call.
private static class CodeSetConversionHolder {
    static final CodeSetConversion csc = new CodeSetConversion() ;
}

/**
 * CodeSetConversion is a singleton, and this is the access point.
 */
public static final CodeSetConversion impl() {
    return CodeSetConversionHolder.csc ;
}

// Number used internally to indicate the fallback code
// set.
private static final int FALLBACK_CODESET = 0;

// Cache of the encoders/decoders handed to the converter classes
// above. Assigned once, hence final.
private final CodeSetCache cache = new CodeSetCache();
- }