1. /*
  2. * Copyright 2001-2004 The Apache Software Foundation.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.apache.commons.codec.language;
  17. import org.apache.commons.codec.EncoderException;
  18. import org.apache.commons.codec.StringEncoder;
  19. /**
  20. * Encodes a string into a metaphone value.
  21. * <p>
  22. * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
  23. * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
  24. * </p>
  25. * <p>
  26. * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
  27. * 39.</CITE>
  28. * </p>
  29. *
  30. * @author Apache Software Foundation
  31. * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
  32. */
  33. public class Metaphone implements StringEncoder {
  34. /**
  35. * Five values in the English language
  36. */
  37. private String vowels = "AEIOU" ;
  38. /**
  39. * Variable used in Metaphone algorithm
  40. */
  41. private String frontv = "EIY" ;
  42. /**
  43. * Variable used in Metaphone algorithm
  44. */
  45. private String varson = "CSPTG" ;
  46. /**
  47. * The max code length for metaphone is 4
  48. */
  49. private int maxCodeLen = 4 ;
  50. /**
  51. * Creates an instance of the Metaphone encoder
  52. */
  53. public Metaphone() {
  54. super();
  55. }
  56. /**
  57. * Find the metaphone value of a String. This is similar to the
  58. * soundex algorithm, but better at finding similar sounding words.
  59. * All input is converted to upper case.
  60. * Limitations: Input format is expected to be a single ASCII word
  61. * with only characters in the A - Z range, no punctuation or numbers.
  62. *
  63. * @param txt String to find the metaphone code for
  64. * @return A metaphone code corresponding to the String supplied
  65. */
  66. public String metaphone(String txt) {
  67. boolean hard = false ;
  68. if ((txt == null) || (txt.length() == 0)) {
  69. return "" ;
  70. }
  71. // single character is itself
  72. if (txt.length() == 1) {
  73. return txt.toUpperCase() ;
  74. }
  75. char[] inwd = txt.toUpperCase().toCharArray() ;
  76. StringBuffer local = new StringBuffer(40); // manipulate
  77. StringBuffer code = new StringBuffer(10) ; // output
  78. // handle initial 2 characters exceptions
  79. switch(inwd[0]) {
  80. case 'K' :
  81. case 'G' :
  82. case 'P' : /* looking for KN, etc*/
  83. if (inwd[1] == 'N') {
  84. local.append(inwd, 1, inwd.length - 1);
  85. } else {
  86. local.append(inwd);
  87. }
  88. break;
  89. case 'A': /* looking for AE */
  90. if (inwd[1] == 'E') {
  91. local.append(inwd, 1, inwd.length - 1);
  92. } else {
  93. local.append(inwd);
  94. }
  95. break;
  96. case 'W' : /* looking for WR or WH */
  97. if (inwd[1] == 'R') { // WR -> R
  98. local.append(inwd, 1, inwd.length - 1);
  99. break ;
  100. }
  101. if (inwd[1] == 'H') {
  102. local.append(inwd, 1, inwd.length - 1);
  103. local.setCharAt(0, 'W'); // WH -> W
  104. } else {
  105. local.append(inwd);
  106. }
  107. break;
  108. case 'X' : /* initial X becomes S */
  109. inwd[0] = 'S';
  110. local.append(inwd);
  111. break ;
  112. default :
  113. local.append(inwd);
  114. } // now local has working string with initials fixed
  115. int wdsz = local.length();
  116. int n = 0 ;
  117. while ((code.length() < this.getMaxCodeLen()) &&
  118. (n < wdsz) ) { // max code size of 4 works well
  119. char symb = local.charAt(n) ;
  120. // remove duplicate letters except C
  121. if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
  122. n++ ;
  123. } else { // not dup
  124. switch(symb) {
  125. case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
  126. if (n == 0) {
  127. code.append(symb);
  128. }
  129. break ; // only use vowel if leading char
  130. case 'B' :
  131. if ( isPreviousChar(local, n, 'M') &&
  132. isLastChar(wdsz, n) ) { // B is silent if word ends in MB
  133. break;
  134. }
  135. code.append(symb);
  136. break;
  137. case 'C' : // lots of C special cases
  138. /* discard if SCI, SCE or SCY */
  139. if ( isPreviousChar(local, n, 'S') &&
  140. !isLastChar(wdsz, n) &&
  141. (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
  142. break;
  143. }
  144. if (regionMatch(local, n, "CIA")) { // "CIA" -> X
  145. code.append('X');
  146. break;
  147. }
  148. if (!isLastChar(wdsz, n) &&
  149. (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
  150. code.append('S');
  151. break; // CI,CE,CY -> S
  152. }
  153. if (isPreviousChar(local, n, 'S') &&
  154. isNextChar(local, n, 'H') ) { // SCH->sk
  155. code.append('K') ;
  156. break ;
  157. }
  158. if (isNextChar(local, n, 'H')) { // detect CH
  159. if ((n == 0) &&
  160. (wdsz >= 3) &&
  161. isVowel(local,2) ) { // CH consonant -> K consonant
  162. code.append('K');
  163. } else {
  164. code.append('X'); // CHvowel -> X
  165. }
  166. } else {
  167. code.append('K');
  168. }
  169. break ;
  170. case 'D' :
  171. if (!isLastChar(wdsz, n + 1) &&
  172. isNextChar(local, n, 'G') &&
  173. (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
  174. code.append('J'); n += 2 ;
  175. } else {
  176. code.append('T');
  177. }
  178. break ;
  179. case 'G' : // GH silent at end or before consonant
  180. if (isLastChar(wdsz, n + 1) &&
  181. isNextChar(local, n, 'H')) {
  182. break;
  183. }
  184. if (!isLastChar(wdsz, n + 1) &&
  185. isNextChar(local,n,'H') &&
  186. !isVowel(local,n+2)) {
  187. break;
  188. }
  189. if ((n > 0) &&
  190. ( regionMatch(local, n, "GN") ||
  191. regionMatch(local, n, "GNED") ) ) {
  192. break; // silent G
  193. }
  194. if (isPreviousChar(local, n, 'G')) {
  195. hard = true ;
  196. } else {
  197. hard = false ;
  198. }
  199. if (!isLastChar(wdsz, n) &&
  200. (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
  201. (!hard)) {
  202. code.append('J');
  203. } else {
  204. code.append('K');
  205. }
  206. break ;
  207. case 'H':
  208. if (isLastChar(wdsz, n)) {
  209. break ; // terminal H
  210. }
  211. if ((n > 0) &&
  212. (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
  213. break;
  214. }
  215. if (isVowel(local,n+1)) {
  216. code.append('H'); // Hvowel
  217. }
  218. break;
  219. case 'F':
  220. case 'J' :
  221. case 'L' :
  222. case 'M':
  223. case 'N' :
  224. case 'R' :
  225. code.append(symb);
  226. break;
  227. case 'K' :
  228. if (n > 0) { // not initial
  229. if (!isPreviousChar(local, n, 'C')) {
  230. code.append(symb);
  231. }
  232. } else {
  233. code.append(symb); // initial K
  234. }
  235. break ;
  236. case 'P' :
  237. if (isNextChar(local,n,'H')) {
  238. // PH -> F
  239. code.append('F');
  240. } else {
  241. code.append(symb);
  242. }
  243. break ;
  244. case 'Q' :
  245. code.append('K');
  246. break;
  247. case 'S' :
  248. if (regionMatch(local,n,"SH") ||
  249. regionMatch(local,n,"SIO") ||
  250. regionMatch(local,n,"SIA")) {
  251. code.append('X');
  252. } else {
  253. code.append('S');
  254. }
  255. break;
  256. case 'T' :
  257. if (regionMatch(local,n,"TIA") ||
  258. regionMatch(local,n,"TIO")) {
  259. code.append('X');
  260. break;
  261. }
  262. if (regionMatch(local,n,"TCH")) {
  263. // Silent if in "TCH"
  264. break;
  265. }
  266. // substitute numeral 0 for TH (resembles theta after all)
  267. if (regionMatch(local,n,"TH")) {
  268. code.append('0');
  269. } else {
  270. code.append('T');
  271. }
  272. break ;
  273. case 'V' :
  274. code.append('F'); break ;
  275. case 'W' : case 'Y' : // silent if not followed by vowel
  276. if (!isLastChar(wdsz,n) &&
  277. isVowel(local,n+1)) {
  278. code.append(symb);
  279. }
  280. break ;
  281. case 'X' :
  282. code.append('K'); code.append('S');
  283. break ;
  284. case 'Z' :
  285. code.append('S'); break ;
  286. } // end switch
  287. n++ ;
  288. } // end else from symb != 'C'
  289. if (code.length() > this.getMaxCodeLen()) {
  290. code.setLength(this.getMaxCodeLen());
  291. }
  292. }
  293. return code.toString();
  294. }
  295. private boolean isVowel(StringBuffer string, int index) {
  296. return (this.vowels.indexOf(string.charAt(index)) >= 0);
  297. }
  298. private boolean isPreviousChar(StringBuffer string, int index, char c) {
  299. boolean matches = false;
  300. if( index > 0 &&
  301. index < string.length() ) {
  302. matches = string.charAt(index - 1) == c;
  303. }
  304. return matches;
  305. }
  306. private boolean isNextChar(StringBuffer string, int index, char c) {
  307. boolean matches = false;
  308. if( index >= 0 &&
  309. index < string.length() - 1 ) {
  310. matches = string.charAt(index + 1) == c;
  311. }
  312. return matches;
  313. }
  314. private boolean regionMatch(StringBuffer string, int index, String test) {
  315. boolean matches = false;
  316. if( index >= 0 &&
  317. (index + test.length() - 1) < string.length() ) {
  318. String substring = string.substring( index, index + test.length());
  319. matches = substring.equals( test );
  320. }
  321. return matches;
  322. }
  323. private boolean isLastChar(int wdsz, int n) {
  324. return n + 1 == wdsz;
  325. }
  326. /**
  327. * Encodes an Object using the metaphone algorithm. This method
  328. * is provided in order to satisfy the requirements of the
  329. * Encoder interface, and will throw an EncoderException if the
  330. * supplied object is not of type java.lang.String.
  331. *
  332. * @param pObject Object to encode
  333. * @return An object (or type java.lang.String) containing the
  334. * metaphone code which corresponds to the String supplied.
  335. * @throws EncoderException if the parameter supplied is not
  336. * of type java.lang.String
  337. */
  338. public Object encode(Object pObject) throws EncoderException {
  339. if (!(pObject instanceof java.lang.String)) {
  340. throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
  341. }
  342. return metaphone((String) pObject);
  343. }
  344. /**
  345. * Encodes a String using the Metaphone algorithm.
  346. *
  347. * @param pString String object to encode
  348. * @return The metaphone code corresponding to the String supplied
  349. */
  350. public String encode(String pString) {
  351. return metaphone(pString);
  352. }
  353. /**
  354. * Tests is the metaphones of two strings are identical.
  355. *
  356. * @param str1 First of two strings to compare
  357. * @param str2 Second of two strings to compare
  358. * @return true if the metaphones of these strings are identical,
  359. * false otherwise.
  360. */
  361. public boolean isMetaphoneEqual(String str1, String str2) {
  362. return metaphone(str1).equals(metaphone(str2));
  363. }
  364. /**
  365. * Returns the maxCodeLen.
  366. * @return int
  367. */
  368. public int getMaxCodeLen() { return this.maxCodeLen; }
  369. /**
  370. * Sets the maxCodeLen.
  371. * @param maxCodeLen The maxCodeLen to set
  372. */
  373. public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
  374. }