Skip to content

Instantly share code, notes, and snippets.

@Baztoune
Created July 18, 2012 15:33
Show Gist options
  • Save Baztoune/3136953 to your computer and use it in GitHub Desktop.
Save Baztoune/3136953 to your computer and use it in GitHub Desktop.
Search for a city, based on (french) zipcode and name, with Levenshtein implementation
import org.apache.commons.lang.StringUtils;
public class MyLevenshteinUtils {
/**
* http://en.wikibooks.org/wiki/Algorithm_Implementation/
* Strings/Levenshtein_distance#Java
*
* @param str1
* @param str2
* @return
*/
public static int computeLevenshteinDistance(CharSequence str1,
CharSequence str2) {
int[][] distance = new int[str1.length() + 1][str2.length() + 1];
for (int i = 0; i <= str1.length(); i++)
distance[i][0] = i;
for (int j = 0; j <= str2.length(); j++)
distance[0][j] = j;
for (int i = 1; i <= str1.length(); i++)
for (int j = 1; j <= str2.length(); j++)
distance[i][j] = minimum(
distance[i - 1][j] + 1,
distance[i][j - 1] + 1,
distance[i - 1][j - 1]
+ ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0
: 1));
return distance[str1.length()][str2.length()];
}
/**
* Levenshtein-Damerau
*
* @param str1
* @param str2
* @return
*/
public static int computeLevenshteinDamerauDistance(CharSequence str1,
CharSequence str2) {
int[][] distance = new int[str1.length() + 1][str2.length() + 1];
for (int i = 0; i <= str1.length(); i++) {
distance[i][0] = i;
}
for (int j = 0; j <= str2.length(); j++) {
distance[0][j] = j;
}
for (int i = 1; i <= str1.length(); i++) {
for (int j = 1; j <= str2.length(); j++) {
int cost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
distance[i][j] = minimum(distance[i - 1][j] + 1,
distance[i][j - 1] + 1, distance[i - 1][j - 1] + cost);
if (i > 1 && j > 1 && str1.charAt(i - 1) == str2.charAt(j - 2)
&& str1.charAt(i - 2) == str2.charAt(j - 1)) {
distance[i][j] = Math.min(distance[i][j],
distance[i - 2][j - 2] + cost);
}
}
}
return distance[str1.length()][str2.length()];
}
private static int minimum(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static Double getCustomScore(String s1, String s2) {
if (StringUtils.isBlank(s1) || StringUtils.isBlank(s2)) {
return 0.0;
}
return 1.0
- (double) LevenshteinUtils.computeLevenshteinDamerauDistance(
s1, s2) / Math.max(s1.length(), s2.length());
}
}
@SuppressWarnings("unchecked")
@Override
protected LevenshteinResultBean handleSearchRefGeoByLevenshtein(
String codePostal, String ville) throws Exception {
// result
RefGeo refGeo = null;
Double score = null;
// utils
StringBuilder qlString = null;
Query query = null;
Session delegate = (Session) emanager.getDelegate();
Double levenshteinScoreMin = (Double) InitialContext
.doLookup("myApp/levenshteinScoreMin");
/*
* Recherche classique par codePostal + ville en ignorant caste et
* accents
*/
qlString = new StringBuilder();
qlString.append("from ")
.append(RefGeo.class.getName())
.append(" g")
.append(" where 1=1 ")
.append(" and codePostal = :codePostal ")
.append(" and upper(convert(ville, 'US7ASCII')) like upper(convert(:ville, 'US7ASCII'))");
query = delegate.createQuery(qlString.toString());
query.setString("codePostal", codePostal);
query.setString("ville", ville);
List<RefGeo> results = (List<RefGeo>) query.list();
if (results != null && results.size() > 0) {
// premier, même si plusieurs résultats
refGeo = results.get(0);
} else {
// non trouvé, recherche meilleure correspondance
qlString = new StringBuilder();
qlString.append("from ").append(RefGeo.class.getName())
.append(" g").append(" where 1=1 ")
.append(" and codePostal = :codePostal ")
.append(" order by g.ville asc");
results = (List<RefGeo>) delegate.createQuery(qlString.toString())
.setString("codePostal", codePostal).list();
if (results != null && results.size() == 1) {
// une seule correspondance (ex : 75001, Paris 1er)
refGeo = results.get(0);
} else if (results != null && results.size() > 0) {
// plusieurs ville pour ce code postal
score = 0.0;
String s1 = MyStringUtils.toUnaccentuatedUpperCase(ville)
.replace("-", " ").replace("'", " ");
String s2;
/* Recherche du meilleur score via algo de Levenshtein */
for (RefGeo g : results) {
s2 = MyStringUtils
.toUnaccentuatedUpperCase(g.getVille())
.replace("-", " ").replace("'", " ");
Double thisScore = MyLevenshteinUtils.getCustomScore(s1, s2);
if (thisScore > levenshteinScoreMin && thisScore > score) {
refGeo = g;
score = thisScore;
}
log.debug(
"Recherché : {0} ({1}), Comparé : {2} ({3}), Score : {4}",
ville, s1, g.getVille(), s2, thisScore);
}
} else {
// code postal inexistant
log.error("Code postal ("
+ codePostal
+ ") non trouvé dans le référentiel. Recherche par ville ("
+ ville + ") sans code postal.");
/* Recherche par ville (tolérant aux majuscules et accents) */
qlString = new StringBuilder();
qlString.append("from ")
.append(RefGeo.class.getName())
.append(" g")
.append(" where 1=1 ")
.append(" and upper(convert(ville, 'US7ASCII')) like upper(convert(:ville, 'US7ASCII'))")
.append(" order by g.codePostal");
query = delegate.createQuery(qlString.toString());
query.setString("ville", ville);
results = (List<RefGeo>) query.list();
if (results != null && results.size() > 0) {
// le premier, par ordre de code postal
refGeo = results.get(0);
} else {
log.error("Ville (" + ville
+ ") non trouvée dans le référentiel");
}
}
}
return new LevenshteinResultBean(refGeo, score);
}
import java.text.Normalizer;
import org.apache.commons.lang.StringUtils;
public class MyStringUtils {
public static String toUnaccentuatedUpperCase(String sIn) {
String sOut = null;
if (StringUtils.isNotBlank(sIn)) {
sOut = Normalizer.normalize(sIn, Normalizer.Form.NFD).replaceAll(
"\\p{InCombiningDiacriticalMarks}+", "");
sOut = StringUtils.upperCase(sOut);
}
return sOut;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment