- /*
- * @(#)CharacterBreakData.java 1.14 00/01/19
- *
- * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
- *
- * This software is the proprietary information of Sun Microsystems, Inc.
- * Use is subject to license terms.
- *
- */
-
- /*
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- *
- */
-
-
- package java.text;
-
- /**
- * The CharacterBreakData contains data used by SimpleTextBoundary
- * to determine character breaks.
- * @see #BreakIterator
- */
- final class CharacterBreakData extends TextBoundaryData
- {
- // THEORY OF OPERATION: This class contains all the tables necessary to do
- // character-break iteration. This class descends from TextBoundaryData, which
- // is abstract. This class doesn't define any non-static members; it inherits the
- // non-static members from TextBoundaryData and fills them in with pointers to
- // the static members defined here.
- // There are two main parts to a TextBoundaryData object: the state-transition
- // tables and the character-mapping tables. The forward state table defines the
- // transitions for a deterministic finite state machine that locates character
- // boundaries. The rows are the states and the columns are character categories.
- // The cell values consist of two parts: The first is the row number of the next
- // state to transition to, or a "stop" value (0). (Because 0 is the stop value
- // rather than a valid state number, row 0 of the array isn't ever looked at; we
- // fill it with STOP values by convention.) The second part is a flag indicating
- // whether the iterator should update its break position on this transition. When
- // the flag is set, the sign bit of the value is turned on (SI is used to represent
- // the flag bit being turned on-- we do it this way rather than just using negative
- // numbers because we still need to see the SI flag when the value of the transition
- // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
- // is 1.
- // The backward state table works the same way as the forward state table, but is
- // usually simplified. The iterator uses the backward state table only to find a
- // "safe place" to start iterating forward. It then seeks forward from the "safe
- // place" to the actual break position using the forward table. A "safe place" is
- // a spot in the text that is guaranteed to be a break position.
- // The character-category mapping tables are split into several pieces, one for
- // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
- // character categories to the character categories used by this break iterator.
- // The index of the array is the Unicode category number as returned by
- // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
- // indicating whether all the characters in the Unicode category have the
- // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
- // an entry is true, then we find the right category using... 3) The kExceptionChar
- // table. This table is a sorted list of SpecialMapping objects. Each entry defines
- // a range of contiguous characters that share the same category and the category
- // number. This list is binary-searched to find an entry corresponding to the
- // charactre being mapped. Only characters whose breaking category is different from
- // the raw-mapping value (the breaking category for their Unicode category) are
- // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
- // in the Latin1 range. This table maps straight from a character value to a
- // category number, bypassing all the other tables. The programmer must take care
- // that all of the different category-mapping tables are consistent.
- // In the current implementation, all of these tables are created and maintained
- // by hand, not using a tool.
-
- // constant names for the category numbers
- private static final byte accent_diacritic = 0; // all Unicode non-spacing marks
- private static final byte baseForm = 1; // everything that isn't accounted for elsewhere
- private static final byte baseCR = 2; // the ASCII carriage return
- private static final byte baseLF = 3; // all other line/paragraph separators
- private static final byte choseong = 4; // Korean initial consonant
- private static final byte jungseong = 5; // Korean vowel
- private static final byte jongseong = 6; // Korean final consonant
- private static final byte EOS = 7; // end of string
- private static final int COL_COUNT = 8; // the number of items in this list (and therefore,
- // the number of columns in the state tables)
-
- private static final byte SI = (byte)0x80;
- private static final byte STOP = (byte) 0;
- private static final byte SI_STOP = (byte)SI + STOP;
-
- public CharacterBreakData() {
- super(kCharacterForwardTable, kCharacterBackwardTable, kCharacterMap);
- }
-
- // This table locates logical character ("grapheme") boundaries. A logical
- // character is a sequence of Unicode code-point values that are seen as a single
- // character by the user. This table implements the following logic:
- // 1) Unless otherwise mentioned, each individual code point is a character.
- // 2) A regular character followed by one or more Unicode non-spacing marks is
- // treated as a single character.
- // 3) The CR-LF sequence is treated as a single character.
- // 4) A Hangul syllable spelled out with individual jamos is treated as a single
- // character, according to the rules specified under "Conjoining Jamo Behavior"
- // in the Unicode standard.
- // UTF-16 surrogate pairs are NOT trated as single characters in this version of the
- // character-breaking tables. Rule 1 is implemented by state 2, rule 2 is implemented
- // by rules 3 and 7 (line/paragraph separators are NOT kept together with any non-
- // spacing marks that follow them!). Rule 4 is implemented with states 4, 5, and 6.
- private static final byte kCharacterForwardData[] =
- {
- // acct base cr lf
- // cho jung jong EOS
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
-
- // 1 - main dispatch state
- (byte)(SI+2), (byte)(SI+2), (byte)(SI+3), (byte)(SI+7),
- (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
-
- // 2 - if the character is regular base or accent, we end up in this
- // state, which eats accents until it sees something else
- (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
-
- // 3 - a CR character causes a transition. If the next character is
- // an LF, it transitions to state 7; otherwise, it does exactly
- // the same thing as state 7
- SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7),
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
-
- // 4 - this state eats Korean initial consonants and uses
- // states 5 and 6 to take care of the other parts of the syllable
- (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
- (byte)(SI+4), (byte)(SI+5), (byte)(SI+6), SI_STOP,
-
- // 5 - this state eats Korean vowels
- (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(SI+5), (byte)(SI+6), SI_STOP,
-
- // 6 - this state eats Korean final consonants
- (byte)(SI+2), SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, (byte)(SI+6), SI_STOP,
-
- // 7 - This state is reached when an LF or other line separator
- // is seen. It eats the LF and stops.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP
- };
- private static final WordBreakTable kCharacterForwardTable =
- new WordBreakTable(COL_COUNT, kCharacterForwardData);
-
- // This table implements the backward-seeking logic. Here, we merely
- // eat characters until we see a Hangul syllable-initial consonant,
- // an ASCII carriage return, a "base" character (most characters), or
- // the end of the string. These characters all represent unambiguous
- // break positions.
- private static final byte kCharacterBackwardData[] =
- {
- // acct base cr lf
- // cho jung jong EOS
- STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP,
-
- // 1
- (byte)(SI+1), SI_STOP, SI_STOP, (byte)(SI+1),
- SI_STOP, (byte)(SI+1), (byte)(SI+1), SI_STOP
- };
-
- private static final WordBreakTable kCharacterBackwardTable =
- new WordBreakTable(COL_COUNT, kCharacterBackwardData);
- private static final int kRawMapping[] =
- {
- baseForm, //UNASSIGNED = 0,
- baseForm, //UPPERCASE_LETTER = 1,
- baseForm, //LOWERCASE_LETTER = 2,
- baseForm, //TITLECASE_LETTER = 3,
- baseForm, //MODIFIER_LETTER = 4,
- baseForm, //OTHER_LETTER = 5,
- accent_diacritic, //NON_SPACING_MARK = 6,
- accent_diacritic, //ENCLOSING_MARK = 7,
- baseForm, //COMBINING_SPACING_MARK = 8,
- baseForm, //DECIMAL_DIGIT_NUMBER = 9,
- baseForm, //LETTER_NUMBER = 10,
- baseForm, //OTHER_NUMBER = 11,
- baseForm, //SPACE_SEPARATOR = 12,
- baseForm, //LINE_SEPARATOR = 13,
- baseForm, //PARAGRAPH_SEPARATOR = 14,
- baseForm, //CONTROL = 15,
- baseForm, //FORMAT = 16,
- baseForm, //???? = 17,
- baseForm, //PRIVATE_USE = 18,
- baseForm, //SURROGATE = 19,
- baseForm, //DASH_PUNCTUATION = 20,
- baseForm, //START_PUNCTUATION = 21,
- baseForm, //END_PUNCTUATION = 22,
- baseForm, //CONNECTOR_PUNCTUATION = 23,
- baseForm, //OTHER_PUNCTUATION = 24,
- baseForm, //MATH_SYMBOL = 25,
- baseForm, //CURRENCY_SYMBOL = 26,
- baseForm, //MODIFIER_SYMBOL = 27,
- baseForm, //OTHER_SYMBOL = 28;
- };
-
- private static final SpecialMapping kExceptionChar[] = //{};
- {
- new SpecialMapping(ASCII_LINEFEED, baseLF),
- new SpecialMapping(ASCII_CARRIAGE_RETURN, baseCR),
- new SpecialMapping(HANGUL_CHOSEONG_LOW, HANGUL_CHOSEONG_HIGH, choseong),
- new SpecialMapping(HANGUL_JUNGSEONG_LOW, HANGUL_JUNGSEONG_HIGH, jungseong),
- new SpecialMapping(HANGUL_JONGSEONG_LOW, HANGUL_JONGSEONG_HIGH, jongseong),
- new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, PUNCTUATION_PARAGRAPH_SEPARATOR, baseLF),
- new SpecialMapping(END_OF_STRING, EOS)
- };
-
- private static final boolean CharacterExceptionFlags[] = {
- false, // kNonCharacter = 0,
- false, // kUppercaseLetter = 1,
- false, // kLowercaseLetter = 2,
- false, // kTitlecaseLetter = 3,
- false, // kModifierLetter = 4,
- true, // kOtherLetter = 5,
- false, // kNonSpacingMark = 6,
- false, // kEnclosingMark = 7,
- false, // kCombiningSpacingMark = 8,
- false, // kDecimalNumber = 9,
- false, // kLetterNumber = 10,
- false, // kOtherNumber = 11,
- false, // kSpaceSeparator = 12,
- true, // kLineSeparator = 13,
- true, // kParagraphSeparator = 14,
- true, // kControlCharacter = 15,
- false, // kFormatCharacter = 16,
- false, // UNDEFINED = 17,
- false, // kPrivateUseCharacter = 18,
- false, // kSurrogate = 19,
- false, // kDashPunctuation = 20,
- false, // kOpenPunctuation = 21,
- false, // kClosePunctuation = 22,
- false, // kConnectorPunctuation = 23,
- false, // kOtherPunctuation = 24,
- false, // kMathSymbol = 25,
- false, // kCurrencySymbol = 26,
- false, // kModifierSymbol = 27,
- false // kOtherSymbol = 28
- };
-
- private static final int kCharacterAsciiValues[] = {
- // null soh stx etx eot enq ask bell
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // bs ht lf vt ff cr so si
- baseForm, baseForm, baseLF, baseForm, baseForm, baseCR, baseForm, baseForm,
- // dle dc1 dc2 dc3 dc4 nak syn etb
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // can em sub esc fs gs rs us
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // sp ! " # $ % & '
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ( ) * + , - . /
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // 0 1 2 3 4 5 6 7
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // 8 9 : ; < = > ?
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // @ A B C D E F G
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // H I J K L M N O
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // P Q R S T U V W
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // X Y Z [ \ ] ^ _
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ` a b c d e f g
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // h i j k l m n o
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // p q r s t u v w
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // x y z { | } ~ del
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // nbsp inv-! cents pounds currency yen broken-bar section
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // umlaut copyright super-a gui-left not soft-hyph registered macron
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // degree +/- super-2 super-3 acute micro paragraph bullet
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
- // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
- baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm
- };
-
- private static final UnicodeClassMapping kCharacterMap
- = new UnicodeClassMapping(kRawMapping, kExceptionChar, CharacterExceptionFlags,
- kCharacterAsciiValues);
- }
-