Skip to content

Instantly share code, notes, and snippets.

@salekh
Created May 7, 2017 20:33
Show Gist options
  • Save salekh/0833358ceadeadc30a17e94f7bda7f31 to your computer and use it in GitHub Desktop.
Save salekh/0833358ceadeadc30a17e94f7bda7f31 to your computer and use it in GitHub Desktop.
Part of a module that performs Hierarchical Clustering to cluster patents for which we have a high confidence that they are authored by the same person Raw
package algorithms;
import base.PatentCluster;
import base.PatstatPatent;
import java.util.ArrayList;
/**
* Created by sanchitalekh on 14/03/2017.
*/
public class HierAggloClustering {
public HierAggloClustering(ArrayList<PatstatPatent> patents, ArrayList<ArrayList<Double>> simMatrix, double minStoppingSimilarity){
this.patents = patents;
this.simMatrix = simMatrix;
this.clusterSimilarities = simMatrix;
this.minStoppingSimilarity = minStoppingSimilarity;
this.clusters = initializeClusters();
}
/**
* patents: ArrayList of n patents
* simMatrix: ArrayList of nxn doubles
* clusters: Initially must be initialized to n, one for each patent
*/
public ArrayList<PatstatPatent> patents = new ArrayList<PatstatPatent>();
public ArrayList<ArrayList<Double>> simMatrix = new ArrayList<ArrayList<Double>>();
public ArrayList<PatentCluster> clusters = new ArrayList<PatentCluster>();
public ArrayList<ArrayList<Double>> clusterSimilarities = new ArrayList<ArrayList<Double>>();
public double minStoppingSimilarity;
public double maxSimilarity;
/**
* Initializes clusters for Hierarchical Agglomerative Clustering
* In Agglomerative Clustering, initially, all items are clusters of their own
* @return initial clusters
*/
public ArrayList<PatentCluster> initializeClusters(){
ArrayList<PatentCluster> initialClusters = new ArrayList<PatentCluster>();
for(int i=0; i<patents.size(); i++){
PatentCluster p = new PatentCluster(patents.get(i),simMatrix.get(i));
initialClusters.add(p);
}
System.out.println("HIERARCHICAL CLUSTERING: Clusters initialized");
return initialClusters;
}
/**
* Main clustering subroutine : HERO FUNCTION
*/
public void cluster(){
maxSimilarity = findMaxSimilarity().extremeSimilarity;
while(maxSimilarity > minStoppingSimilarity){
int highIndex;
int lowIndex;
int indexI;
int indexJ;
ReturnExtremeSimilarity res = findMaxSimilarity();
indexI = res.i;
indexJ = res.j;
maxSimilarity = res.extremeSimilarity;
/**
* THIS MEANS THAT CLUSTER INDEX indexI and indexJ should be merged
* We will merge into the cluster with lower index
* We will set cluster with higher index to null
*/
if(indexI > indexJ){
highIndex = indexI;
lowIndex = indexJ;
}
else{
highIndex = indexJ;
lowIndex = indexI;
}
clusters.get(lowIndex).addtoCluster(patents.get(highIndex),simMatrix.get(highIndex));
clusters.set(highIndex,null);
removeSimEntriesforAgglomeratedCluster(this.clusterSimilarities,highIndex);
//update cluster similarities according to new clusters
updateClusterSimilarities(this.clusterSimilarities,lowIndex);
System.out.println("Clusters " + Integer.toString(lowIndex+1) + " and " + Integer.toString(highIndex+1) + " merged!");
maxSimilarity = findMaxSimilarity().extremeSimilarity;
}
}
public ArrayList<PatentCluster> getClusters(){
return this.clusters;
}
public void updateClusterSimilarities(ArrayList<ArrayList<Double>> clusterSimilarities, int lowIndex){
int indexI = lowIndex;
for(int indexJ = 0; indexJ < clusterSimilarities.size(); indexJ++){
double maxMemSim = 0.0;
if(clusterSimilarities.get(indexI).get(indexJ) == -1.0) continue;
else if(indexI == indexJ) continue;
else {
ArrayList<ArrayList<Double>> clusterMemSims = clusters.get(indexI).getMemberSimilarities();
for (int i=0; i<clusterMemSims.size(); i++){
if(clusterMemSims.get(i).get(indexJ) > maxMemSim){
maxMemSim = clusterMemSims.get(i).get(indexJ);
}
}
}
clusterSimilarities.get(indexI).set(indexJ,maxMemSim);
clusterSimilarities.get(indexJ).set(indexI,maxMemSim);
}
}
public void removeSimEntriesforAgglomeratedCluster(ArrayList<ArrayList<Double>> clusterSims, int index){
for(int i=0; i< clusterSims.size(); i++){
int j = index;
if(i==j) continue;
else{
clusterSims.get(i).set(j,-1.0);
clusterSims.get(j).set(i,-1.0);
}
}
}
/**
* finds patent pairs with the maximum similarity
* @return class ReturnExtremeSimilarity variable with maximum similarity
*/
public ReturnExtremeSimilarity findMaxSimilarity(){
double max = 0.0;
int maxI = 0;
int maxJ = 0;
for (int i=0; i<clusterSimilarities.size(); i++){
for (int j=0; j<=i; j++){
if(i==j) continue;
else{
double temp = clusterSimilarities.get(i).get(j);
if(temp > max){
max = temp;
maxI = i;
maxJ = j;
}
}
}
}
ReturnExtremeSimilarity returnExtremeSimilarity = new ReturnExtremeSimilarity(max,maxI,maxJ);
return returnExtremeSimilarity;
}
/**
* finds patent pairs with the minimum similarity
* @return class ReturnExtremeSimilarity variable with minimum similarity
*/
public ReturnExtremeSimilarity findMinSimilarity(){
double min = 99.0;
int minI = 0;
int minJ = 0;
for (int i=0; i<clusterSimilarities.size(); i++){
for (int j=0; j<=i; j++){
if(i==j) continue;
else{
double temp = clusterSimilarities.get(i).get(j);
if(temp < min){
min = temp;
minI = i;
minJ = j;
}
}
}
}
ReturnExtremeSimilarity returnExtremeSimilarity = new ReturnExtremeSimilarity(min,minI,minJ);
return returnExtremeSimilarity;
}
public class ReturnExtremeSimilarity {
public ReturnExtremeSimilarity(double extremeSimilarity, int i, int j){
this.extremeSimilarity = extremeSimilarity;
this.i = i;
this.j = j;
}
public double extremeSimilarity;
public int i;
public int j;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment