1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.codec.language;
  17. import org.apache.commons.codec.EncoderException;
  18. import org.apache.commons.codec.StringEncoder;
  19. /**
  20. * Encodes a string into a double metaphone value.
  21. * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
  22. * <ul>
  23. * <li>Original Article: <a
  24. * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
  25. * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
  26. * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
  27. * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
  28. * </ul>
  29. *
  30. * @author Apache Software Foundation
  31. * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
  32. */
  33. public class DoubleMetaphone implements StringEncoder {
  34. /**
  35. * "Vowels" to test for
  36. */
  37. private static final String VOWELS = "AEIOUY";
  38. /**
  39. * Prefixes when present which are not pronounced
  40. */
  41. private static final String[] SILENT_START =
  42. { "GN", "KN", "PN", "WR", "PS" };
  43. private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
  44. { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
  45. private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
  46. { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
  47. private static final String[] L_T_K_S_N_M_B_Z =
  48. { "L", "T", "K", "S", "N", "M", "B", "Z" };
  49. /**
  50. * Maximum length of an encoding, default is 4
  51. */
  52. protected int maxCodeLen = 4;
  53. /**
  54. * Creates an instance of this DoubleMetaphone encoder
  55. */
  56. public DoubleMetaphone() {
  57. super();
  58. }
  59. /**
  60. * Encode a value with Double Metaphone
  61. *
  62. * @param value String to encode
  63. * @return an encoded string
  64. */
  65. public String doubleMetaphone(String value) {
  66. return doubleMetaphone(value, false);
  67. }
  68. /**
  69. * Encode a value with Double Metaphone, optionally using the alternate
  70. * encoding.
  71. *
  72. * @param value String to encode
  73. * @param alternate use alternate encode
  74. * @return an encoded string
  75. */
  76. public String doubleMetaphone(String value, boolean alternate) {
  77. value = cleanInput(value);
  78. if (value == null) {
  79. return null;
  80. }
  81. boolean slavoGermanic = isSlavoGermanic(value);
  82. int index = isSilentStart(value) ? 1 : 0;
  83. DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
  84. while (!result.isComplete() && index <= value.length() - 1) {
  85. switch (value.charAt(index)) {
  86. case 'A':
  87. case 'E':
  88. case 'I':
  89. case 'O':
  90. case 'U':
  91. case 'Y':
  92. index = handleAEIOUY(value, result, index);
  93. break;
  94. case 'B':
  95. result.append('P');
  96. index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
  97. break;
  98. case '\u00C7':
  99. // A C with a Cedilla
  100. result.append('S');
  101. index++;
  102. break;
  103. case 'C':
  104. index = handleC(value, result, index);
  105. break;
  106. case 'D':
  107. index = handleD(value, result, index);
  108. break;
  109. case 'F':
  110. result.append('F');
  111. index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
  112. break;
  113. case 'G':
  114. index = handleG(value, result, index, slavoGermanic);
  115. break;
  116. case 'H':
  117. index = handleH(value, result, index);
  118. break;
  119. case 'J':
  120. index = handleJ(value, result, index, slavoGermanic);
  121. break;
  122. case 'K':
  123. result.append('K');
  124. index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
  125. break;
  126. case 'L':
  127. index = handleL(value, result, index);
  128. break;
  129. case 'M':
  130. result.append('M');
  131. index = conditionM0(value, index) ? index + 2 : index + 1;
  132. break;
  133. case 'N':
  134. result.append('N');
  135. index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
  136. break;
  137. case '\u00D1':
  138. // N with a tilde (spanish ene)
  139. result.append('N');
  140. index++;
  141. break;
  142. case 'P':
  143. index = handleP(value, result, index);
  144. break;
  145. case 'Q':
  146. result.append('K');
  147. index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
  148. break;
  149. case 'R':
  150. index = handleR(value, result, index, slavoGermanic);
  151. break;
  152. case 'S':
  153. index = handleS(value, result, index, slavoGermanic);
  154. break;
  155. case 'T':
  156. index = handleT(value, result, index);
  157. break;
  158. case 'V':
  159. result.append('F');
  160. index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
  161. break;
  162. case 'W':
  163. index = handleW(value, result, index);
  164. break;
  165. case 'X':
  166. index = handleX(value, result, index);
  167. break;
  168. case 'Z':
  169. index = handleZ(value, result, index, slavoGermanic);
  170. break;
  171. default:
  172. index++;
  173. break;
  174. }
  175. }
  176. return alternate ? result.getAlternate() : result.getPrimary();
  177. }
  178. /**
  179. * Encode the value using DoubleMetaphone. It will only work if
  180. * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
  181. *
  182. * @param obj Object to encode (should be of type String)
  183. * @return An encoded Object (will be of type String)
  184. * @throws EncoderException encode parameter is not of type String
  185. */
  186. public Object encode(Object obj) throws EncoderException {
  187. if (!(obj instanceof String)) {
  188. throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
  189. }
  190. return doubleMetaphone((String) obj);
  191. }
  192. /**
  193. * Encode the value using DoubleMetaphone.
  194. *
  195. * @param value String to encode
  196. * @return An encoded String
  197. */
  198. public String encode(String value) {
  199. return doubleMetaphone(value);
  200. }
  201. /**
  202. * Check if the Double Metaphone values of two <code>String</code> values
  203. * are equal.
  204. *
  205. * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
  206. * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
  207. * @return <code>true</code> if the encoded <code>String</code>s are equal;
  208. * <code>false</code> otherwise.
  209. * @see #isDoubleMetaphoneEqual(String,String,boolean)
  210. */
  211. public boolean isDoubleMetaphoneEqual(String value1, String value2) {
  212. return isDoubleMetaphoneEqual(value1, value2, false);
  213. }
  214. /**
  215. * Check if the Double Metaphone values of two <code>String</code> values
  216. * are equal, optionally using the alternate value.
  217. *
  218. * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
  219. * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
  220. * @param alternate use the alternate value if <code>true</code>.
  221. * @return <code>true</code> if the encoded <code>String</code>s are equal;
  222. * <code>false</code> otherwise.
  223. */
  224. public boolean isDoubleMetaphoneEqual(String value1,
  225. String value2,
  226. boolean alternate) {
  227. return doubleMetaphone(value1, alternate).equals(doubleMetaphone
  228. (value2, alternate));
  229. }
  230. /**
  231. * Returns the maxCodeLen.
  232. * @return int
  233. */
  234. public int getMaxCodeLen() {
  235. return this.maxCodeLen;
  236. }
  237. /**
  238. * Sets the maxCodeLen.
  239. * @param maxCodeLen The maxCodeLen to set
  240. */
  241. public void setMaxCodeLen(int maxCodeLen) {
  242. this.maxCodeLen = maxCodeLen;
  243. }
  244. //-- BEGIN HANDLERS --//
  245. /**
  246. * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
  247. */
  248. private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
  249. index) {
  250. if (index == 0) {
  251. result.append('A');
  252. }
  253. return index + 1;
  254. }
  255. /**
  256. * Handles 'C' cases
  257. */
  258. private int handleC(String value,
  259. DoubleMetaphoneResult result,
  260. int index) {
  261. if (conditionC0(value, index)) { // very confusing, moved out
  262. result.append('K');
  263. index += 2;
  264. } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
  265. result.append('S');
  266. index += 2;
  267. } else if (contains(value, index, 2, "CH")) {
  268. index = handleCH(value, result, index);
  269. } else if (contains(value, index, 2, "CZ") &&
  270. !contains(value, index - 2, 4, "WICZ")) {
  271. //-- "Czerny" --//
  272. result.append('S', 'X');
  273. index += 2;
  274. } else if (contains(value, index + 1, 3, "CIA")) {
  275. //-- "focaccia" --//
  276. result.append('X');
  277. index += 3;
  278. } else if (contains(value, index, 2, "CC") &&
  279. !(index == 1 && charAt(value, 0) == 'M')) {
  280. //-- double "cc" but not "McClelland" --//
  281. return handleCC(value, result, index);
  282. } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
  283. result.append('K');
  284. index += 2;
  285. } else if (contains(value, index, 2, "CI", "CE", "CY")) {
  286. //-- Italian vs. English --//
  287. if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
  288. result.append('S', 'X');
  289. } else {
  290. result.append('S');
  291. }
  292. index += 2;
  293. } else {
  294. result.append('K');
  295. if (contains(value, index + 1, 2, " C", " Q", " G")) {
  296. //-- Mac Caffrey, Mac Gregor --//
  297. index += 3;
  298. } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
  299. !contains(value, index + 1, 2, "CE", "CI")) {
  300. index += 2;
  301. } else {
  302. index++;
  303. }
  304. }
  305. return index;
  306. }
  307. /**
  308. * Handles 'CC' cases
  309. */
  310. private int handleCC(String value,
  311. DoubleMetaphoneResult result,
  312. int index) {
  313. if (contains(value, index + 2, 1, "I", "E", "H") &&
  314. !contains(value, index + 2, 2, "HU")) {
  315. //-- "bellocchio" but not "bacchus" --//
  316. if ((index == 1 && charAt(value, index - 1) == 'A') ||
  317. contains(value, index - 1, 5, "UCCEE", "UCCES")) {
  318. //-- "accident", "accede", "succeed" --//
  319. result.append("KS");
  320. } else {
  321. //-- "bacci", "bertucci", other Italian --//
  322. result.append('X');
  323. }
  324. index += 3;
  325. } else { // Pierce's rule
  326. result.append('K');
  327. index += 2;
  328. }
  329. return index;
  330. }
  331. /**
  332. * Handles 'CH' cases
  333. */
  334. private int handleCH(String value,
  335. DoubleMetaphoneResult result,
  336. int index) {
  337. if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael
  338. result.append('K', 'X');
  339. return index + 2;
  340. } else if (conditionCH0(value, index)) {
  341. //-- Greek roots ("chemistry", "chorus", etc.) --//
  342. result.append('K');
  343. return index + 2;
  344. } else if (conditionCH1(value, index)) {
  345. //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
  346. result.append('K');
  347. return index + 2;
  348. } else {
  349. if (index > 0) {
  350. if (contains(value, 0, 2, "MC")) {
  351. result.append('K');
  352. } else {
  353. result.append('X', 'K');
  354. }
  355. } else {
  356. result.append('X');
  357. }
  358. return index + 2;
  359. }
  360. }
  361. /**
  362. * Handles 'D' cases
  363. */
  364. private int handleD(String value,
  365. DoubleMetaphoneResult result,
  366. int index) {
  367. if (contains(value, index, 2, "DG")) {
  368. //-- "Edge" --//
  369. if (contains(value, index + 2, 1, "I", "E", "Y")) {
  370. result.append('J');
  371. index += 3;
  372. //-- "Edgar" --//
  373. } else {
  374. result.append("TK");
  375. index += 2;
  376. }
  377. } else if (contains(value, index, 2, "DT", "DD")) {
  378. result.append('T');
  379. index += 2;
  380. } else {
  381. result.append('T');
  382. index++;
  383. }
  384. return index;
  385. }
  386. /**
  387. * Handles 'G' cases
  388. */
  389. private int handleG(String value,
  390. DoubleMetaphoneResult result,
  391. int index,
  392. boolean slavoGermanic) {
  393. if (charAt(value, index + 1) == 'H') {
  394. index = handleGH(value, result, index);
  395. } else if (charAt(value, index + 1) == 'N') {
  396. if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
  397. result.append("KN", "N");
  398. } else if (!contains(value, index + 2, 2, "EY") &&
  399. charAt(value, index + 1) != 'Y' && !slavoGermanic) {
  400. result.append("N", "KN");
  401. } else {
  402. result.append("KN");
  403. }
  404. index = index + 2;
  405. } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
  406. result.append("KL", "L");
  407. index += 2;
  408. } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
  409. //-- -ges-, -gep-, -gel-, -gie- at beginning --//
  410. result.append('K', 'J');
  411. index += 2;
  412. } else if ((contains(value, index + 1, 2, "ER") ||
  413. charAt(value, index + 1) == 'Y') &&
  414. !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
  415. !contains(value, index - 1, 1, "E", "I") &&
  416. !contains(value, index - 1, 3, "RGY", "OGY")) {
  417. //-- -ger-, -gy- --//
  418. result.append('K', 'J');
  419. index += 2;
  420. } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
  421. contains(value, index - 1, 4, "AGGI", "OGGI")) {
  422. //-- Italian "biaggi" --//
  423. if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
  424. //-- obvious germanic --//
  425. result.append('K');
  426. } else if (contains(value, index + 1, 4, "IER")) {
  427. result.append('J');
  428. } else {
  429. result.append('J', 'K');
  430. }
  431. index += 2;
  432. } else if (charAt(value, index + 1) == 'G') {
  433. index += 2;
  434. result.append('K');
  435. } else {
  436. index++;
  437. result.append('K');
  438. }
  439. return index;
  440. }
  441. /**
  442. * Handles 'GH' cases
  443. */
  444. private int handleGH(String value,
  445. DoubleMetaphoneResult result,
  446. int index) {
  447. if (index > 0 && !isVowel(charAt(value, index - 1))) {
  448. result.append('K');
  449. index += 2;
  450. } else if (index == 0) {
  451. if (charAt(value, index + 2) == 'I') {
  452. result.append('J');
  453. } else {
  454. result.append('K');
  455. }
  456. index += 2;
  457. } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
  458. (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
  459. (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
  460. //-- Parker's rule (with some further refinements) - "hugh"
  461. index += 2;
  462. } else {
  463. if (index > 2 && charAt(value, index - 1) == 'U' &&
  464. contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
  465. //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
  466. result.append('F');
  467. } else if (index > 0 && charAt(value, index - 1) != 'I') {
  468. result.append('K');
  469. }
  470. index += 2;
  471. }
  472. return index;
  473. }
  474. /**
  475. * Handles 'H' cases
  476. */
  477. private int handleH(String value,
  478. DoubleMetaphoneResult result,
  479. int index) {
  480. //-- only keep if first & before vowel or between 2 vowels --//
  481. if ((index == 0 || isVowel(charAt(value, index - 1))) &&
  482. isVowel(charAt(value, index + 1))) {
  483. result.append('H');
  484. index += 2;
  485. //-- also takes car of "HH" --//
  486. } else {
  487. index++;
  488. }
  489. return index;
  490. }
  491. /**
  492. * Handles 'J' cases
  493. */
  494. private int handleJ(String value, DoubleMetaphoneResult result, int index,
  495. boolean slavoGermanic) {
  496. if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
  497. //-- obvious Spanish, "Jose", "San Jacinto" --//
  498. if ((index == 0 && (charAt(value, index + 4) == ' ') ||
  499. value.length() == 4) || contains(value, 0, 4, "SAN ")) {
  500. result.append('H');
  501. } else {
  502. result.append('J', 'H');
  503. }
  504. index++;
  505. } else {
  506. if (index == 0 && !contains(value, index, 4, "JOSE")) {
  507. result.append('J', 'A');
  508. } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
  509. (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
  510. result.append('J', 'H');
  511. } else if (index == value.length() - 1) {
  512. result.append('J', ' ');
  513. } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
  514. result.append('J');
  515. }
  516. if (charAt(value, index + 1) == 'J') {
  517. index += 2;
  518. } else {
  519. index++;
  520. }
  521. }
  522. return index;
  523. }
  524. /**
  525. * Handles 'L' cases
  526. */
  527. private int handleL(String value,
  528. DoubleMetaphoneResult result,
  529. int index) {
  530. result.append('L');
  531. if (charAt(value, index + 1) == 'L') {
  532. if (conditionL0(value, index)) {
  533. result.appendAlternate(' ');
  534. }
  535. index += 2;
  536. } else {
  537. index++;
  538. }
  539. return index;
  540. }
  541. /**
  542. * Handles 'P' cases
  543. */
  544. private int handleP(String value,
  545. DoubleMetaphoneResult result,
  546. int index) {
  547. if (charAt(value, index + 1) == 'H') {
  548. result.append('F');
  549. index += 2;
  550. } else {
  551. result.append('P');
  552. index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
  553. }
  554. return index;
  555. }
  556. /**
  557. * Handles 'R' cases
  558. */
  559. private int handleR(String value,
  560. DoubleMetaphoneResult result,
  561. int index,
  562. boolean slavoGermanic) {
  563. if (index == value.length() - 1 && !slavoGermanic &&
  564. contains(value, index - 2, 2, "IE") &&
  565. !contains(value, index - 4, 2, "ME", "MA")) {
  566. result.appendAlternate('R');
  567. } else {
  568. result.append('R');
  569. }
  570. return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
  571. }
  572. /**
  573. * Handles 'S' cases
  574. */
  575. private int handleS(String value,
  576. DoubleMetaphoneResult result,
  577. int index,
  578. boolean slavoGermanic) {
  579. if (contains(value, index - 1, 3, "ISL", "YSL")) {
  580. //-- special cases "island", "isle", "carlisle", "carlysle" --//
  581. index++;
  582. } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
  583. //-- special case "sugar-" --//
  584. result.append('X', 'S');
  585. index++;
  586. } else if (contains(value, index, 2, "SH")) {
  587. if (contains(value, index + 1, 4,
  588. "HEIM", "HOEK", "HOLM", "HOLZ")) {
  589. //-- germanic --//
  590. result.append('S');
  591. } else {
  592. result.append('X');
  593. }
  594. index += 2;
  595. } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
  596. //-- Italian and Armenian --//
  597. if (slavoGermanic) {
  598. result.append('S');
  599. } else {
  600. result.append('S', 'X');
  601. }
  602. index += 3;
  603. } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
  604. //-- german & anglicisations, e.g. "smith" match "schmidt" //
  605. // "snider" match "schneider" --//
  606. //-- also, -sz- in slavic language altho in hungarian it //
  607. // is pronounced "s" --//
  608. result.append('S', 'X');
  609. index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
  610. } else if (contains(value, index, 2, "SC")) {
  611. index = handleSC(value, result, index);
  612. } else {
  613. if (index == value.length() - 1 && contains(value, index - 2,
  614. 2, "AI", "OI")){
  615. //-- french e.g. "resnais", "artois" --//
  616. result.appendAlternate('S');
  617. } else {
  618. result.append('S');
  619. }
  620. index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
  621. }
  622. return index;
  623. }
  624. /**
  625. * Handles 'SC' cases
  626. */
  627. private int handleSC(String value,
  628. DoubleMetaphoneResult result,
  629. int index) {
  630. if (charAt(value, index + 2) == 'H') {
  631. //-- Schlesinger's rule --//
  632. if (contains(value, index + 3,
  633. 2, "OO", "ER", "EN", "UY", "ED", "EM")) {
  634. //-- Dutch origin, e.g. "school", "schooner" --//
  635. if (contains(value, index + 3, 2, "ER", "EN")) {
  636. //-- "schermerhorn", "schenker" --//
  637. result.append("X", "SK");
  638. } else {
  639. result.append("SK");
  640. }
  641. } else {
  642. if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
  643. result.append('X', 'S');
  644. } else {
  645. result.append('X');
  646. }
  647. }
  648. } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
  649. result.append('S');
  650. } else {
  651. result.append("SK");
  652. }
  653. return index + 3;
  654. }
  655. /**
  656. * Handles 'T' cases
  657. */
  658. private int handleT(String value,
  659. DoubleMetaphoneResult result,
  660. int index) {
  661. if (contains(value, index, 4, "TION")) {
  662. result.append('X');
  663. index += 3;
  664. } else if (contains(value, index, 3, "TIA", "TCH")) {
  665. result.append('X');
  666. index += 3;
  667. } else if (contains(value, index, 2, "TH") || contains(value, index,
  668. 3, "TTH")) {
  669. if (contains(value, index + 2, 2, "OM", "AM") ||
  670. //-- special case "thomas", "thames" or germanic --//
  671. contains(value, 0, 4, "VAN ", "VON ") ||
  672. contains(value, 0, 3, "SCH")) {
  673. result.append('T');
  674. } else {
  675. result.append('0', 'T');
  676. }
  677. index += 2;
  678. } else {
  679. result.append('T');
  680. index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
  681. }
  682. return index;
  683. }
  684. /**
  685. * Handles 'W' cases
  686. */
  687. private int handleW(String value,
  688. DoubleMetaphoneResult result,
  689. int index) {
  690. if (contains(value, index, 2, "WR")) {
  691. //-- can also be in middle of word --//
  692. result.append('R');
  693. index += 2;
  694. } else {
  695. if (index == 0 && (isVowel(charAt(value, index + 1)) ||
  696. contains(value, index, 2, "WH"))) {
  697. if (isVowel(charAt(value, index + 1))) {
  698. //-- Wasserman should match Vasserman --//
  699. result.append('A', 'F');
  700. } else {
  701. //-- need Uomo to match Womo --//
  702. result.append('A');
  703. }
  704. index++;
  705. } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
  706. contains(value, index - 1,
  707. 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
  708. contains(value, 0, 3, "SCH")) {
  709. //-- Arnow should match Arnoff --//
  710. result.appendAlternate('F');
  711. index++;
  712. } else if (contains(value, index, 4, "WICZ", "WITZ")) {
  713. //-- Polish e.g. "filipowicz" --//
  714. result.append("TS", "FX");
  715. index += 4;
  716. } else {
  717. index++;
  718. }
  719. }
  720. return index;
  721. }
  722. /**
  723. * Handles 'X' cases
  724. */
  725. private int handleX(String value,
  726. DoubleMetaphoneResult result,
  727. int index) {
  728. if (index == 0) {
  729. result.append('S');
  730. index++;
  731. } else {
  732. if (!((index == value.length() - 1) &&
  733. (contains(value, index - 3, 3, "IAU", "EAU") ||
  734. contains(value, index - 2, 2, "AU", "OU")))) {
  735. //-- French e.g. breaux --//
  736. result.append("KS");
  737. }
  738. index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
  739. }
  740. return index;
  741. }
  742. /**
  743. * Handles 'Z' cases
  744. */
  745. private int handleZ(String value, DoubleMetaphoneResult result, int index,
  746. boolean slavoGermanic) {
  747. if (charAt(value, index + 1) == 'H') {
  748. //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
  749. result.append('J');
  750. index += 2;
  751. } else {
  752. if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
  753. result.append("S", "TS");
  754. } else {
  755. result.append('S');
  756. }
  757. index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
  758. }
  759. return index;
  760. }
  761. //-- BEGIN CONDITIONS --//
  762. /**
  763. * Complex condition 0 for 'C'
  764. */
  765. private boolean conditionC0(String value, int index) {
  766. if (contains(value, index, 4, "CHIA")) {
  767. return true;
  768. } else if (index <= 1) {
  769. return false;
  770. } else if (isVowel(charAt(value, index - 2))) {
  771. return false;
  772. } else if (!contains(value, index - 1, 3, "ACH")) {
  773. return false;
  774. } else {
  775. char c = charAt(value, index + 2);
  776. return (c != 'I' && c != 'E')
  777. || contains(value, index - 2, 6, "BACHER", "MACHER");
  778. }
  779. }
  780. /**
  781. * Complex condition 0 for 'CH'
  782. */
  783. private boolean conditionCH0(String value, int index) {
  784. if (index != 0) {
  785. return false;
  786. } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
  787. !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
  788. return false;
  789. } else if (contains(value, 0, 5, "CHORE")) {
  790. return false;
  791. } else {
  792. return true;
  793. }
  794. }
  795. /**
  796. * Complex condition 1 for 'CH'
  797. */
  798. private boolean conditionCH1(String value, int index) {
  799. return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
  800. 3, "SCH")) ||
  801. contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
  802. contains(value, index + 2, 1, "T", "S") ||
  803. ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
  804. (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
  805. }
  806. /**
  807. * Complex condition 0 for 'L'
  808. */
  809. private boolean conditionL0(String value, int index) {
  810. if (index == value.length() - 3 &&
  811. contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
  812. return true;
  813. } else if ((contains(value, index - 1, 2, "AS", "OS") ||
  814. contains(value, value.length() - 1, 1, "A", "O")) &&
  815. contains(value, index - 1, 4, "ALLE")) {
  816. return true;
  817. } else {
  818. return false;
  819. }
  820. }
  821. /**
  822. * Complex condition 0 for 'M'
  823. */
  824. private boolean conditionM0(String value, int index) {
  825. if (charAt(value, index + 1) == 'M') {
  826. return true;
  827. }
  828. return contains(value, index - 1, 3, "UMB")
  829. && ((index + 1) == value.length() - 1 || contains(value,
  830. index + 2, 2, "ER"));
  831. }
  832. //-- BEGIN HELPER FUNCTIONS --//
  833. /**
  834. * Determines whether or not a value is of slavo-germanic orgin. A value is
  835. * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
  836. */
  837. private boolean isSlavoGermanic(String value) {
  838. return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
  839. value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
  840. }
  841. /**
  842. * Determines whether or not a character is a vowel or not
  843. */
  844. private boolean isVowel(char ch) {
  845. return VOWELS.indexOf(ch) != -1;
  846. }
  847. /**
  848. * Determines whether or not the value starts with a silent letter. It will
  849. * return <code>true</code> if the value starts with any of 'GN', 'KN',
  850. * 'PN', 'WR' or 'PS'.
  851. */
  852. private boolean isSilentStart(String value) {
  853. boolean result = false;
  854. for (int i = 0; i < SILENT_START.length; i++) {
  855. if (value.startsWith(SILENT_START[i])) {
  856. result = true;
  857. break;
  858. }
  859. }
  860. return result;
  861. }
  862. /**
  863. * Cleans the input
  864. */
  865. private String cleanInput(String input) {
  866. if (input == null) {
  867. return null;
  868. }
  869. input = input.trim();
  870. if (input.length() == 0) {
  871. return null;
  872. }
  873. return input.toUpperCase();
  874. }
  875. /**
  876. * Gets the character at index <code>index</code> if available, otherwise
  877. * it returns <code>Character.MIN_VALUE</code> so that there is some sort
  878. * of a default
  879. */
  880. protected char charAt(String value, int index) {
  881. if (index < 0 || index >= value.length()) {
  882. return Character.MIN_VALUE;
  883. }
  884. return value.charAt(index);
  885. }
  886. /**
  887. * Shortcut method with 1 criteria
  888. */
  889. private static boolean contains(String value, int start, int length,
  890. String criteria) {
  891. return contains(value, start, length,
  892. new String[] { criteria });
  893. }
  894. /**
  895. * Shortcut method with 2 criteria
  896. */
  897. private static boolean contains(String value, int start, int length,
  898. String criteria1, String criteria2) {
  899. return contains(value, start, length,
  900. new String[] { criteria1, criteria2 });
  901. }
  902. /**
  903. * Shortcut method with 3 criteria
  904. */
  905. private static boolean contains(String value, int start, int length,
  906. String criteria1, String criteria2,
  907. String criteria3) {
  908. return contains(value, start, length,
  909. new String[] { criteria1, criteria2, criteria3 });
  910. }
  911. /**
  912. * Shortcut method with 4 criteria
  913. */
  914. private static boolean contains(String value, int start, int length,
  915. String criteria1, String criteria2,
  916. String criteria3, String criteria4) {
  917. return contains(value, start, length,
  918. new String[] { criteria1, criteria2, criteria3,
  919. criteria4 });
  920. }
  921. /**
  922. * Shortcut method with 5 criteria
  923. */
  924. private static boolean contains(String value, int start, int length,
  925. String criteria1, String criteria2,
  926. String criteria3, String criteria4,
  927. String criteria5) {
  928. return contains(value, start, length,
  929. new String[] { criteria1, criteria2, criteria3,
  930. criteria4, criteria5 });
  931. }
  932. /**
  933. * Shortcut method with 6 criteria
  934. */
  935. private static boolean contains(String value, int start, int length,
  936. String criteria1, String criteria2,
  937. String criteria3, String criteria4,
  938. String criteria5, String criteria6) {
  939. return contains(value, start, length,
  940. new String[] { criteria1, criteria2, criteria3,
  941. criteria4, criteria5, criteria6 });
  942. }
  943. /**
  944. * Determines whether <code>value</code> contains any of the criteria
  945. starting
  946. * at index <code>start</code> and matching up to length <code>length</code>
  947. */
  948. protected static boolean contains(String value, int start, int length,
  949. String[] criteria) {
  950. boolean result = false;
  951. if (start >= 0 && start + length <= value.length()) {
  952. String target = value.substring(start, start + length);
  953. for (int i = 0; i < criteria.length; i++) {
  954. if (target.equals(criteria[i])) {
  955. result = true;
  956. break;
  957. }
  958. }
  959. }
  960. return result;
  961. }
  962. //-- BEGIN INNER CLASSES --//
  963. /**
  964. * Inner class for storing results, since there is the optional alternate
  965. * encoding.
  966. */
  967. public class DoubleMetaphoneResult {
  968. private StringBuffer primary = new StringBuffer(getMaxCodeLen());
  969. private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
  970. private int maxLength;
  971. public DoubleMetaphoneResult(int maxLength) {
  972. this.maxLength = maxLength;
  973. }
  974. public void append(char value) {
  975. appendPrimary(value);
  976. appendAlternate(value);
  977. }
  978. public void append(char primary, char alternate) {
  979. appendPrimary(primary);
  980. appendAlternate(alternate);
  981. }
  982. public void appendPrimary(char value) {
  983. if (this.primary.length() < this.maxLength) {
  984. this.primary.append(value);
  985. }
  986. }
  987. public void appendAlternate(char value) {
  988. if (this.alternate.length() < this.maxLength) {
  989. this.alternate.append(value);
  990. }
  991. }
  992. public void append(String value) {
  993. appendPrimary(value);
  994. appendAlternate(value);
  995. }
  996. public void append(String primary, String alternate) {
  997. appendPrimary(primary);
  998. appendAlternate(alternate);
  999. }
  1000. public void appendPrimary(String value) {
  1001. int addChars = this.maxLength - this.primary.length();
  1002. if (value.length() <= addChars) {
  1003. this.primary.append(value);
  1004. } else {
  1005. this.primary.append(value.substring(0, addChars));
  1006. }
  1007. }
  1008. public void appendAlternate(String value) {
  1009. int addChars = this.maxLength - this.alternate.length();
  1010. if (value.length() <= addChars) {
  1011. this.alternate.append(value);
  1012. } else {
  1013. this.alternate.append(value.substring(0, addChars));
  1014. }
  1015. }
  1016. public String getPrimary() {
  1017. return this.primary.toString();
  1018. }
  1019. public String getAlternate() {
  1020. return this.alternate.toString();
  1021. }
  1022. public boolean isComplete() {
  1023. return this.primary.length() >= this.maxLength &&
  1024. this.alternate.length() >= this.maxLength;
  1025. }
  1026. }
  1027. }