Created
April 21, 2017 00:03
-
-
Save binzram/df94eafaa3b3e8ca07734ec30e5df58d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package ch.bbc.wikicompare.algorythms; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
import java.util.Set; | |
public class DiceCoefficent { | |
public double diceCoefficient(String s1, String s2) | |
{ | |
Set<String> nx = new HashSet<String>(); | |
Set<String> ny = new HashSet<String>(); | |
for (int i=0; i < s1.length()-1; i++) { | |
char x1 = s1.charAt(i); | |
char x2 = s1.charAt(i+1); | |
String tmp = "" + x1 + x2; | |
nx.add(tmp); | |
} | |
for (int j=0; j < s2.length()-1; j++) { | |
char y1 = s2.charAt(j); | |
char y2 = s2.charAt(j+1); | |
String tmp = "" + y1 + y2; | |
ny.add(tmp); | |
} | |
Set<String> intersection = new HashSet<String>(nx); | |
intersection.retainAll(ny); | |
double totcombigrams = intersection.size(); | |
return (2*totcombigrams) / (nx.size()+ny.size()); | |
} | |
public double diceCoefficientOptimized(String s, String t) | |
{ | |
// Verifying the input: | |
if (s == null || t == null) | |
return 0; | |
// Quick check to catch identical objects: | |
if (s == t) | |
return 1; | |
// avoid exception for single character searches | |
if (s.length() < 2 || t.length() < 2) | |
return 0; | |
// Create the bigrams for string s: | |
final int n = s.length()-1; | |
final int[] sPairs = new int[n]; | |
for (int i = 0; i <= n; i++) | |
if (i == 0) | |
sPairs[i] = s.charAt(i) << 16; | |
else if (i == n) | |
sPairs[i-1] |= s.charAt(i); | |
else | |
sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16; | |
// Create the bigrams for string t: | |
final int m = t.length()-1; | |
final int[] tPairs = new int[m]; | |
for (int i = 0; i <= m; i++) | |
if (i == 0) | |
tPairs[i] = t.charAt(i) << 16; | |
else if (i == m) | |
tPairs[i-1] |= t.charAt(i); | |
else | |
tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16; | |
// Sort the bigram lists: | |
Arrays.sort(sPairs); | |
Arrays.sort(tPairs); | |
// Count the matches: | |
int matches = 0, i = 0, j = 0; | |
while (i < n && j < m) | |
{ | |
if (sPairs[i] == tPairs[j]) | |
{ | |
matches += 2; | |
i++; | |
j++; | |
} | |
else if (sPairs[i] < tPairs[j]) | |
i++; | |
else | |
j++; | |
} | |
return (double)matches/(n+m); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment