Skip to content

Instantly share code, notes, and snippets.

@samuelgmartinez
Created October 4, 2012 11:56
Show Gist options
  • Save samuelgmartinez/3833160 to your computer and use it in GitHub Desktop.
Save samuelgmartinez/3833160 to your computer and use it in GitHub Desktop.
Custom Clustering algorithm based on MinHash
/**
*
*/
package es.colbenson.sb.clustering;
import java.util.HashSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
/**
 * {@link Clusterizator} implementation that maps a set of strings to a single bucket id
 * computed from the hashes of its elements.
 *
 * <p>Every element is hashed to a 64-bit value; the values are sorted (and de-duplicated) so
 * that the iteration order of the input set does not influence the result, and the sorted
 * sequence is then hashed again to obtain the bucket id.
 *
 * @author Samuel García Martínez <samuelg@colbenson.es>
 *
 */
public class CustomClusterizator implements Clusterizator {

    /** Seed handed to the murmur3 hash function. */
    private static final int SEED = 17239195;

    /**
     * Separator placed between element hashes. The decimal form of a {@code long} only
     * contains digits and an optional leading {@code '-'}, so a comma can never be confused
     * with part of a value.
     */
    private static final char SEPARATOR = ',';

    private final HashFunction function = Hashing.murmur3_128(SEED);

    /**
     * Computes the bucket ids for the given set of strings.
     *
     * <p>NOTE: element hashes are joined with a separator before the final hashing step.
     * Without it, different hash sequences could produce the same concatenated text (for
     * example {@code -51, 2} and {@code -512}) and therefore wrongly share a bucket. Ids of
     * empty and single-element sets are the same as before this separator was introduced;
     * ids of larger sets differ from the ones produced previously.
     *
     * @param set the strings to cluster
     * @return a new mutable set containing exactly one bucket id
     * @throws NullPointerException if {@code set} is {@code null}
     */
    public Set<String> proccessClusterIds(Set<String> set) {
        // Sorting makes the signature independent of the input set's iteration order.
        SortedSet<Long> longHashes = new TreeSet<Long>();
        for (String str : set) {
            longHashes.add(function.hashString(str).asLong());
        }

        StringBuilder sb = new StringBuilder();
        for (Long longHash : longHashes) {
            // A non-empty builder means at least one value was already written.
            if (sb.length() > 0) {
                sb.append(SEPARATOR);
            }
            sb.append(longHash);
        }

        Set<String> buckets = new HashSet<String>(1);
        buckets.add(function.hashString(sb.toString()).toString());
        return buckets;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment