- /*
- * @(#)WordBreakData.java 1.16 00/01/19
- *
- * Copyright 1996-2000 Sun Microsystems, Inc. All Rights Reserved.
- *
- * This software is the proprietary information of Sun Microsystems, Inc.
- * Use is subject to license terms.
- *
- */
-
- /*
- * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
- * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
- *
- * The original version of this source code and documentation
- * is copyrighted and owned by Taligent, Inc., a wholly-owned
- * subsidiary of IBM. These materials are provided under terms
- * of a License Agreement between Taligent and Sun. This technology
- * is protected by multiple US and International patents.
- *
- * This notice and attribution to Taligent may not be removed.
- * Taligent is a registered trademark of Taligent, Inc.
- *
- */
-
- package java.text;
-
- /**
- * The WordBreakData contains data used by SimpleTextBoundary
- * to determine word breaks.
- * @see #BreakIterator
- */
- final class WordBreakData extends TextBoundaryData
- {
- // THEORY OF OPERATION: This class contains all the tables necessary to do
- // character-break iteration. This class descends from TextBoundaryData, which
- // is abstract. This class doesn't define any non-static members; it inherits the
- // non-static members from TextBoundaryData and fills them in with pointers to
- // the static members defined here.
- // There are two main parts to a TextBoundaryData object: the state-transition
- // tables and the character-mapping tables. The forward state table defines the
- // transitions for a deterministic finite state machine that locates character
- // boundaries. The rows are the states and the columns are character categories.
- // The cell values consist of two parts: The first is the row number of the next
- // state to transition to, or a "stop" value (0). (Because 0 is the stop value
- // rather than a valid state number, row 0 of the array isn't ever looked at; we
- // fill it with STOP values by convention.) The second part is a flag indicating
- // whether the iterator should update its break position on this transition. When
- // the flag is set, the sign bit of the value is turned on (SI is used to represent
- // the flag bit being turned on-- we do it this way rather than just using negative
- // numbers because we still need to see the SI flag when the value of the transition
- // is STOP. SI_STOP is used to denote this.) The starting state in all state tables
- // is 1.
- // The backward state table works the same way as the forward state table, but is
- // usually simplified. The iterator uses the backward state table only to find a
- // "safe place" to start iterating forward. It then seeks forward from the "safe
- // place" to the actual break position using the forward table. A "safe place" is
- // a spot in the text that is guaranteed to be a break position.
- // The character-category mapping tables are split into several pieces, one for
- // each stage of the category-mapping process: 1) kRawMapping maps generic Unicode
- // character categories to the character categories used by this break iterator.
- // The index of the array is the Unicode category number as returned by
- // Character.getType(). 2) The kExceptionFlags table is a table of Boolean values
- // indicating whether all the characters in the Unicode category have the
- // raw-mapping value. The rows correspond to the rows of the raw-mapping table. If
- // an entry is true, then we find the right category using... 3) The kExceptionChar
- // table. This table is a sorted list of SpecialMapping objects. Each entry defines
- // a range of contiguous characters that share the same category and the category
- // number. This list is binary-searched to find an entry corresponding to the
- // charactre being mapped. Only characters whose breaking category is different from
- // the raw-mapping value (the breaking category for their Unicode category) are
- // listed in this table. 4) The kAsciiValues table is a fast-path table for characters
- // in the Latin1 range. This table maps straight from a character value to a
- // category number, bypassing all the other tables. The programmer must take care
- // that all of the different category-mapping tables are consistent.
- // In the current implementation, all of these tables are created and maintained
- // by hand, not using a tool.
-
- private static final byte BREAK = 0; // characters not listed in any other category
- private static final byte letter = 1; // letters
- private static final byte number = 2; // digits
- private static final byte midLetter = 3;// punctuation that can occur within a word
- private static final byte midLetNum = 4;// punctuation that can occur inside a wors or a number
- private static final byte preNum = 5; // characters that may serve as a prefix to a number
- private static final byte postNum = 6; // characters that may serve as a suffix to a number
- private static final byte midNum = 7; // punctuation that can occur inside a number
- private static final byte preMidNum = 8;// punctuation that can occur either at the beginning
- // of or inside a number
- private static final byte blank = 9; // white space (other than always-break characters)
- private static final byte lf = 10; // the ASCII LF character
- private static final byte kata = 11; // Katakana
- private static final byte hira = 12; // Hiragana
- private static final byte kanji = 13; // all CJK ideographs
- private static final byte diacrit = 14; // CJK diacriticals
- private static final byte cr = 15; // the ASCII CR character
- private static final byte nsm = 16; // Unicode non-spacing marks
- private static final byte EOS = 17; // end of string
- private static final int COL_COUNT = 18;// number of categories
-
- private static final byte SI = (byte)0x80;
- private static final byte STOP = (byte) 0;
- private static final byte SI_STOP = (byte)SI + STOP;
-
- public WordBreakData() {
- super(kWordForward, kWordBackward, kWordMap);
- }
-
- // This table locates word boundaries, as this is defined for "find whole words"
- // searches and often for double-click selection. In this case, "words" are kept
- // separate from whitespace and punctuation.
- // The rules implemented here are as follows:
- // 1) Unless mentioned below, all characters are treated as "words" unto themselves
- // and have break positions on both sides (state 14)
- // 2) A "word" is kept together, and consists of a sequence of letters. Certain
- // punctuation marks, such as apostrophes and hyphens, are allowed inside a "word"
- // without causing a break, but only if they're flanked on both sides by letters.
- // (states 2 and 7)
- // 3) A "number" is kept together, and consists of an optional prefix character (such
- // as a minus, decimal point, or currency symbol), followed by a sequence of digits,
- // followed by an optional suffix character (such as a percent sign). The sequence
- // of digits may contain certain punctuation characters (such as commas and periods),
- // but only if they're flanked on both sides by digits. (states 3, 8, and 14)
- // 4) If a "number" and "word" occur in succession without any intervening characters,
- // they are kept together. This allows sequences like "$30F3" or "ascii2ebcdic" to
- // be treated as single units. (transitions between states 2 and 3)
- // 5) Sequences of whitespace are kept together. (state 6)
- // 6) The CR-LF sequence is kept together. (states 4 and 13)
- // 7) A sequence of Kanji is kept together. (state 12)
- // 8) Sequences of Hiragana and Katakana are kept together, and may include their
- // common diacritical marks. (states 10 and 11)
- // [The logic for Kanji and Kana characters is an approximation. There is no way
- // to detect real Japanese word boundaries without a dictionary.]
- // 9) Unicode non-spacing marks are completely transparent to the algorithm.
- // (see the "nsm" column)
- private static final byte kWordForwardData[] =
- {
- // brk let num mLe mLN
- // prN poN mNu pMN blk
- // lf kat hir kan dia
- // cr nsm EOS
-
- // 0 - dummy state
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP,
-
- // 1 - main dispatch state
- (byte)(SI+14), (byte)(SI+2), (byte)(SI+3), (byte)(SI+14), (byte)(SI+14),
- (byte)(SI+5), (byte)(SI+14), (byte)(SI+14), (byte)(SI+5), (byte)(SI+6),
- (byte)(SI+4), (byte)(SI+10), (byte)(SI+11), (byte)(SI+12), (byte)(SI+9),
- (byte)(SI+13), (byte)(1), SI_STOP,
-
- // 2 - This state eats letters, advances to state 3 for numbers, and
- // goes to state 7 for mid-word punctuation.
- SI_STOP, (byte)(SI+2), (byte)(SI+3), (byte)(SI+7), (byte)(SI+7),
- SI_STOP, SI_STOP, SI_STOP, (byte)(SI+7), SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(2), SI_STOP,
-
- // 3 - This state eats digits, advances to state 2 for letters, uses
- // state 8 to handle mid-number punctuation, and goes to state 14 for
- // number-suffix characters.
- SI_STOP, (byte)(SI+2), (byte)(SI+3), SI_STOP, (byte)(SI+8),
- SI_STOP, (byte)(SI+14), (byte)(SI+8), (byte)(SI+8), SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(3), SI_STOP,
-
- // 4 - This state handles LFs by eating the LF and stopping.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP,
-
- // 5 - This state handles number-prefix characters. If the next character
- // is a digit, it goes to state 3; otherwise, it stops (the character is
- // a "word" by itself).
- SI_STOP, SI_STOP, (byte)(SI+3), SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(5), SI_STOP,
-
- // 6 - This state eats whitespace and stops on everything else.
- // (Except for CRs and LFs, which are kept together with the whitespace.)
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, (byte)(SI+6),
- (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- (byte)(SI+13), (byte)(6), SI_STOP,
-
- // 7 - This state handles mid-word punctuation: If the next character is a
- // letter, we're still in the word and we keep going. Otherwise, we stop,
- // and the break was actually before this character.
- STOP, (byte)(SI+2), STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(7), STOP,
-
- // 8 - This state handles mid-number punctuation: If the next character is a
- // digit, we're still in the word and we keep going. Otherwise, we stop,
- // and the break position is actually before this character.
- STOP, STOP, (byte)(SI+3), STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(8), STOP,
-
- // 9 - This state handles CJK diacritics. It'll keep going if the next
- // character is CJK; otherwise, it stops.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(SI+10), (byte)(SI+11), SI_STOP, (byte)(SI+9),
- SI_STOP, (byte)(9), SI_STOP,
-
- // 10 - This state eats Katakana and CJK discritics.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(SI+10), SI_STOP, SI_STOP, (byte)(SI+10),
- SI_STOP, (byte)(10), SI_STOP,
-
- // 11 - This state eats Hiragana and CJK diacritics.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, (byte)(SI+11), SI_STOP, (byte)(SI+11),
- SI_STOP, (byte)(11), SI_STOP,
-
- // 12 - This state eats Kanji.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, (byte)(SI+12), SI_STOP,
- SI_STOP, (byte)(12), SI_STOP,
-
- // 13 - This state handles CRs, which are "words" unto themselves (or
- // with preceding whitespace) unless followed by an LFs.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- (byte)(SI+4), SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP,
-
- // 14 - This state handles LFs and number-suffix characters (when they
- // actually end a number) by eating the character and stopping.
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, SI_STOP, SI_STOP, SI_STOP, SI_STOP,
- SI_STOP, (byte)(14), SI_STOP
- };
- private static final WordBreakTable kWordForward =
- new WordBreakTable(COL_COUNT, kWordForwardData);
-
- // This table is a completely-reversed version of the forward table.
- private static final byte kWordBackwardData[] =
- {
- // brk let num mLe mLN
- // prN poN mNu pMN blk
- // lf kat hir kan dia
- // cr nsm EOS
-
- // 0
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP,
-
- // 1
- (byte)(SI+6), (byte)(SI+2), (byte)(SI+3), (byte)(SI+4), (byte)(SI+5),
- (byte)(SI+6), (byte)(SI+7), (byte)(SI+7), (byte)(SI+5), (byte)(SI+8),
- (byte)(SI+8), (byte)(SI+9), (byte)(SI+10), (byte)(SI+12), (byte)(SI+11),
- (byte)(SI+8), (byte)(1), STOP,
-
- // 2
- STOP, (byte)(SI+2), (byte)(SI+3), (byte)(4), (byte)(4),
- STOP, STOP, STOP, (byte)(4), STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(2), STOP,
-
- // 3
- STOP, (byte)(SI+2), (byte)(SI+3), STOP, (byte)(7),
- SI_STOP, STOP, (byte)(7), (byte)(SI+7), STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(3), STOP,
-
- // 4
- STOP, (byte)(SI+2), STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(4), STOP,
-
- // 5
- STOP, (byte)(SI+2), (byte)(SI+3), STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(5), STOP,
-
- // 6
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(6), STOP,
-
- // 7
- STOP, STOP, (byte)(SI+3), STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(7), STOP,
-
- // 8
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, (byte)(SI+8),
- (byte)(SI+8), STOP, STOP, STOP, STOP,
- (byte)(SI+8), (byte)(8), STOP,
-
- // 9
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(SI+9), STOP, STOP, (byte)(9),
- STOP, (byte)(9), STOP,
-
- // 10
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, (byte)(SI+10),STOP, (byte)(10),
- STOP, (byte)(10), STOP,
-
- // 11
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, (byte)(SI+9), (byte)(SI+10), STOP, (byte)(SI+11),
- STOP, (byte)(11), STOP,
-
- // 12
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, STOP, STOP,
- STOP, STOP, STOP, (byte)(SI+12), STOP,
- STOP, (byte)(12), STOP
- };
- private static final WordBreakTable kWordBackward =
- new WordBreakTable(COL_COUNT, kWordBackwardData);
- private static final int kRawMapping[] =
- {
- BREAK, // UNASSIGNED = 0,
- letter, // UPPERCASE_LETTER = 1,
- letter, // LOWERCASE_LETTER = 2,
- letter, // TITLECASE_LETTER = 3,
- letter, // MODIFIER_LETTER = 4,
- letter, // OTHER_LETTER = 5,
- nsm, // NON_SPACING_MARK = 6,
- nsm, // ENCLOSING_MARK = 7,
- BREAK, // COMBINING_SPACING_MARK = 8,
- number, // DECIMAL_DIGIT_NUMBER = 9,
- letter, // LETTER_NUMBER = 10,
- number, // OTHER_NUMBER = 11,
- blank, // SPACE_SEPARATOR = 12,
- BREAK, // LINE_SEPARATOR = 13,
- BREAK, // PARAGRAPH_SEPARATOR = 14,
- BREAK, // CONTROL = 15,
- BREAK, // FORMAT = 16
- BREAK, // ???? = 17,
- BREAK, // PRIVATE_USE = 18,
- BREAK, // SURROGATE = 19,
- midLetter, // DASH_PUNCTUATION = 20,
- BREAK, // START_PUNCTUATION = 21,
- BREAK, // END_PUNCTUATION = 22,
- BREAK, // CONNECTOR_PUNCTUATION = 23,
- BREAK, // OTHER_PUNCTUATION = 24,
- BREAK, // MATH_SYMBOL = 25,
- preNum, // CURRENCY_SYMBOL = 26,
- BREAK, // MODIFIER_SYMBOL = 27,
- BREAK // OTHER_SYMBOL = 28
- };
- private static final SpecialMapping kExceptionChar[] =
- {
- //note: the ranges in this table must be sorted in ascending order
- //as required by the UnicodeClassMapping class.
- new SpecialMapping(ASCII_HORIZONTAL_TABULATION, blank),
- new SpecialMapping(ASCII_LINEFEED, lf),
- new SpecialMapping(ASCII_FORM_FEED, lf),
- new SpecialMapping(ASCII_CARRIAGE_RETURN, cr),
- new SpecialMapping(ASCII_QUOTATION_MARK, midLetNum),
- new SpecialMapping(ASCII_NUMBER_SIGN, preNum),
- new SpecialMapping(ASCII_PERCENT, postNum),
- new SpecialMapping(ASCII_AMPERSAND, postNum),
- new SpecialMapping(ASCII_APOSTROPHE, midLetNum),
- new SpecialMapping(ASCII_COMMA, midNum),
- new SpecialMapping(ASCII_FULL_STOP, preMidNum),
- new SpecialMapping(ASCII_CENT_SIGN, postNum),
- new SpecialMapping(LATIN1_SOFTHYPHEN, midLetter),
- new SpecialMapping(ARABIC_PERCENT_SIGN, postNum),
- new SpecialMapping(ARABIC_DECIMAL_SEPARATOR, midNum),
- new SpecialMapping(PUNCTUATION_HYPHENATION_POINT, midLetter),
- new SpecialMapping(PUNCTUATION_LINE_SEPARATOR,
- PUNCTUATION_PARAGRAPH_SEPARATOR, lf),
- new SpecialMapping(PER_MILLE_SIGN, postNum),
- new SpecialMapping(PER_TEN_THOUSAND_SIGN, postNum),
- new SpecialMapping(IDEOGRAPHIC_ITERATION_MARK, kanji),
- new SpecialMapping(HIRAGANA_LETTER_SMALL_A, HIRAGANA_LETTER_VU, hira),
- new SpecialMapping(COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
- HIRAGANA_SEMIVOICED_SOUND_MARK, diacrit),
- new SpecialMapping(HIRAGANA_ITERATION_MARK, HIRAGANA_VOICED_ITERATION_MARK, hira),
- new SpecialMapping(KATAKANA_LETTER_SMALL_A,
- KATAKANA_LETTER_SMALL_KE, kata),
- new SpecialMapping(KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK, diacrit),
- new SpecialMapping(KATAKANA_ITERATION_MARK, KATAKANA_VOICED_ITERATION_MARK, kata),
- new SpecialMapping(UNICODE_LOW_BOUND_HAN,
- UNICODE_HIGH_BOUND_HAN, kanji),
- new SpecialMapping(HANGUL_SYL_LOW, HANGUL_SYL_HIGH, letter),
- new SpecialMapping(CJK_COMPATIBILITY_F900,
- CJK_COMPATIBILITY_FA2D, kanji),
- new SpecialMapping(END_OF_STRING, EOS)
- };
-
- private static final boolean WordExceptionFlags[] = {
- false, // kNonCharacter = 0,
- false, // kUppercaseLetter = 1,
- false, // kLowercaseLetter = 2,
- false, // kTitlecaseLetter = 3,
- true, // kModifierLetter = 4,
- true, // kOtherLetter = 5,
- true, // kNonSpacingMark = 6,
- false, // kEnclosingMark = 7,
- false, // kCombiningSpacingMark = 8,
- false, // kDecimalNumber = 9,
- false, // kLetterNumber = 10,
- false, // kOtherNumber = 11,
- false, // kSpaceSeparator = 12,
- true, // kLineSeparator = 13,
- true, // kParagraphSeparator = 14,
- true, // kControlCharacter = 15,
- false, // kFormatCharacter = 16,
- false, // UNDEFINED = 17,
- false, // kPrivateUseCharacter = 18,
- false, // kSurrogate = 19,
- true, // kDashPunctuation = 20,
- false, // kOpenPunctuation = 21,
- false, // kClosePunctuation = 22,
- false, // kConnectorPunctuation = 23,
- true, // kOtherPunctuation = 24,
- false, // kMathSymbol = 25,
- true, // kCurrencySymbol = 26,
- false, // kModifierSymbol = 27,
- false // kOtherSymbol = 28
- };
-
- private static final int kWordAsciiValues[] = {
- // null soh stx etx eot enq ask bell
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // bs ht lf vt ff cr so si
- BREAK, blank, lf, BREAK, lf, cr, BREAK, BREAK,
- // dle dc1 dc2 dc3 dc4 nak syn etb
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // can em sub esc fs gs rs us
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // sp ! " # $ % & '
- blank, BREAK, midLetNum, preNum, preNum, postNum, postNum, midLetNum,
- // ( ) * + , - . /
- BREAK, BREAK, BREAK, BREAK, midNum, midLetter, preMidNum, BREAK,
- // 0 1 2 3 4 5 6 7
- number, number, number, number, number, number, number, number,
- // 8 9 : ; < = > ?
- number, number, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // @ A B C D E F G
- BREAK, letter, letter, letter, letter, letter, letter, letter,
- // H I J K L M N O
- letter, letter, letter, letter, letter, letter, letter, letter,
- // P Q R S T U V W
- letter, letter, letter, letter, letter, letter, letter, letter,
- // X Y Z [ \ ] ^ _
- letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
- // ` a b c d e f g
- BREAK, letter, letter, letter, letter, letter, letter, letter,
- // h i j k l m n o
- letter, letter, letter, letter, letter, letter, letter, letter,
- // p q r s t u v w
- letter, letter, letter, letter, letter, letter, letter, letter,
- // x y z { | } ~ del
- letter, letter, letter, BREAK, BREAK, BREAK, BREAK, BREAK,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // ctrl ctrl ctrl ctrl ctrl ctrl ctrl ctrl
- BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK, BREAK,
- // nbsp inv-! cents pounds currency yen broken-bar section
- blank, BREAK, postNum, preNum, preNum, preNum, BREAK, BREAK,
- // umlaut copyright super-a gui-left not soft-hyph registered macron
- BREAK, BREAK, letter, BREAK, BREAK, midLetter, BREAK, BREAK,
- // degree +/- super-2 super-3 acute micro paragraph bullet
- BREAK, BREAK, number, number, BREAK, letter, BREAK, BREAK,
- // cedilla super-1 super-o gui-right 1/4 1/2 3/4 inv-?
- BREAK, letter, BREAK, BREAK, number, number, number, BREAK,
- // A-grave A-acute A-hat A-tilde A-umlaut A-ring AE C-cedilla
- letter, letter, letter, letter, letter, letter, letter, letter,
- // E-grave E-acute E-hat E-umlaut I-grave I-acute I-hat I-umlaut
- letter, letter, letter, letter, letter, letter, letter, letter,
- // Edh N-tilde O-grave O-acute O-hat O-tilde O-umlaut times
- letter, letter, letter, letter, letter, letter, letter, BREAK,
- // O-slash U-grave U-acute U-hat U-umlaut Y-acute Thorn ess-zed
- letter, letter, letter, letter, letter, letter, letter, letter,
- // a-grave a-acute a-hat a-tilde a-umlaut a-ring ae c-cedilla
- letter, letter, letter, letter, letter, letter, letter, letter,
- // e-grave e-acute e-hat e-umlaut i-grave i-acute i-hat i-umlaut
- letter, letter, letter, letter, letter, letter, letter, letter,
- // edh n-tilde o-grave o-acute o-hat o-tilde o-umlaut over
- letter, letter, letter, letter, letter, letter, letter, BREAK,
- // o-slash u-grave u-acute u-hat u-umlaut y-acute thorn y-umlaut
- letter, letter, letter, letter, letter, letter, letter, letter
- };
-
- private static final UnicodeClassMapping kWordMap
- = new UnicodeClassMapping(kRawMapping, kExceptionChar, WordExceptionFlags,
- kWordAsciiValues);
- }
-