Created
July 18, 2012 15:33
-
-
Save Baztoune/3136953 to your computer and use it in GitHub Desktop.
Search for a city, based on (french) zipcode and name, with Levenshtein implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.commons.lang.StringUtils; | |
public class MyLevenshteinUtils { | |
/** | |
* http://en.wikibooks.org/wiki/Algorithm_Implementation/ | |
* Strings/Levenshtein_distance#Java | |
* | |
* @param str1 | |
* @param str2 | |
* @return | |
*/ | |
public static int computeLevenshteinDistance(CharSequence str1, | |
CharSequence str2) { | |
int[][] distance = new int[str1.length() + 1][str2.length() + 1]; | |
for (int i = 0; i <= str1.length(); i++) | |
distance[i][0] = i; | |
for (int j = 0; j <= str2.length(); j++) | |
distance[0][j] = j; | |
for (int i = 1; i <= str1.length(); i++) | |
for (int j = 1; j <= str2.length(); j++) | |
distance[i][j] = minimum( | |
distance[i - 1][j] + 1, | |
distance[i][j - 1] + 1, | |
distance[i - 1][j - 1] | |
+ ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 | |
: 1)); | |
return distance[str1.length()][str2.length()]; | |
} | |
/** | |
* Levenshtein-Damerau | |
* | |
* @param str1 | |
* @param str2 | |
* @return | |
*/ | |
public static int computeLevenshteinDamerauDistance(CharSequence str1, | |
CharSequence str2) { | |
int[][] distance = new int[str1.length() + 1][str2.length() + 1]; | |
for (int i = 0; i <= str1.length(); i++) { | |
distance[i][0] = i; | |
} | |
for (int j = 0; j <= str2.length(); j++) { | |
distance[0][j] = j; | |
} | |
for (int i = 1; i <= str1.length(); i++) { | |
for (int j = 1; j <= str2.length(); j++) { | |
int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1; | |
distance[i][j] = minimum(distance[i - 1][j] + 1, | |
distance[i][j - 1] + 1, distance[i - 1][j - 1] + cost); | |
if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2) | |
&& str1.charAt(i - 2) == str2.charAt(j - 1)) { | |
distance[i][j] = Math.min(distance[i][j], | |
distance[i - 2][j - 2] + cost); | |
} | |
} | |
} | |
return distance[str1.length()][str2.length()]; | |
} | |
private static int minimum(int a, int b, int c) { | |
return Math.min(Math.min(a, b), c); | |
} | |
public static Double getCustomScore(String s1, String s2) { | |
if (StringUtils.isBlank(s1) || StringUtils.isBlank(s2)) { | |
return 0.0; | |
} | |
return 1.0 | |
- (double) LevenshteinUtils.computeLevenshteinDamerauDistance( | |
s1, s2) / Math.max(s1.length(), s2.length()); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@SuppressWarnings("unchecked") | |
@Override | |
protected LevenshteinResultBean handleSearchRefGeoByLevenshtein( | |
String codePostal, String ville) throws Exception { | |
// result | |
RefGeo refGeo = null; | |
Double score = null; | |
// utils | |
StringBuilder qlString = null; | |
Query query = null; | |
Session delegate = (Session) emanager.getDelegate(); | |
Double levenshteinScoreMin = (Double) InitialContext | |
.doLookup("myApp/levenshteinScoreMin"); | |
/* | |
* Recherche classique par codePostal + ville en ignorant caste et | |
* accents | |
*/ | |
qlString = new StringBuilder(); | |
qlString.append("from ") | |
.append(RefGeo.class.getName()) | |
.append(" g") | |
.append(" where 1=1 ") | |
.append(" and codePostal = :codePostal ") | |
.append(" and upper(convert(ville, 'US7ASCII')) like upper(convert(:ville, 'US7ASCII'))"); | |
query = delegate.createQuery(qlString.toString()); | |
query.setString("codePostal", codePostal); | |
query.setString("ville", ville); | |
List<RefGeo> results = (List<RefGeo>) query.list(); | |
if (results != null && results.size() > 0) { | |
// premier, même si plusieurs résultats | |
refGeo = results.get(0); | |
} else { | |
// non trouvé, recherche meilleure correspondance | |
qlString = new StringBuilder(); | |
qlString.append("from ").append(RefGeo.class.getName()) | |
.append(" g").append(" where 1=1 ") | |
.append(" and codePostal = :codePostal ") | |
.append(" order by g.ville asc"); | |
results = (List<RefGeo>) delegate.createQuery(qlString.toString()) | |
.setString("codePostal", codePostal).list(); | |
if (results != null && results.size() == 1) { | |
// une seule correspondance (ex : 75001, Paris 1er) | |
refGeo = results.get(0); | |
} else if (results != null && results.size() > 0) { | |
// plusieurs ville pour ce code postal | |
score = 0.0; | |
String s1 = MyStringUtils.toUnaccentuatedUpperCase(ville) | |
.replace("-", " ").replace("'", " "); | |
String s2; | |
/* Recherche du meilleur score via algo de Levenshtein */ | |
for (RefGeo g : results) { | |
s2 = MyStringUtils | |
.toUnaccentuatedUpperCase(g.getVille()) | |
.replace("-", " ").replace("'", " "); | |
Double thisScore = MyLevenshteinUtils.getCustomScore(s1, s2); | |
if (thisScore > levenshteinScoreMin && thisScore > score) { | |
refGeo = g; | |
score = thisScore; | |
} | |
log.debug( | |
"Recherché : {0} ({1}), Comparé : {2} ({3}), Score : {4}", | |
ville, s1, g.getVille(), s2, thisScore); | |
} | |
} else { | |
// code postal inexistant | |
log.error("Code postal (" | |
+ codePostal | |
+ ") non trouvé dans le référentiel. Recherche par ville (" | |
+ ville + ") sans code postal."); | |
/* Recherche par ville (tolérant aux majuscules et accents) */ | |
qlString = new StringBuilder(); | |
qlString.append("from ") | |
.append(RefGeo.class.getName()) | |
.append(" g") | |
.append(" where 1=1 ") | |
.append(" and upper(convert(ville, 'US7ASCII')) like upper(convert(:ville, 'US7ASCII'))") | |
.append(" order by g.codePostal"); | |
query = delegate.createQuery(qlString.toString()); | |
query.setString("ville", ville); | |
results = (List<RefGeo>) query.list(); | |
if (results != null && results.size() > 0) { | |
// le premier, par ordre de code postal | |
refGeo = results.get(0); | |
} else { | |
log.error("Ville (" + ville | |
+ ") non trouvée dans le référentiel"); | |
} | |
} | |
} | |
return new LevenshteinResultBean(refGeo, score); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.text.Normalizer; | |
import org.apache.commons.lang.StringUtils; | |
public class MyStringUtils { | |
public static String toUnaccentuatedUpperCase(String sIn) { | |
String sOut = null; | |
if (StringUtils.isNotBlank(sIn)) { | |
sOut = Normalizer.normalize(sIn, Normalizer.Form.NFD).replaceAll( | |
"\\p{InCombiningDiacriticalMarks}+", ""); | |
sOut = StringUtils.upperCase(sOut); | |
} | |
return sOut; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment