Created
May 7, 2017 20:33
-
-
Save salekh/0833358ceadeadc30a17e94f7bda7f31 to your computer and use it in GitHub Desktop.
Part of a module that performs Hierarchical Clustering to cluster patents for which we have a high confidence that they are authored by the same person Raw
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package algorithms; | |
import base.PatentCluster; | |
import base.PatstatPatent; | |
import java.util.ArrayList; | |
/** | |
* Created by sanchitalekh on 14/03/2017. | |
*/ | |
public class HierAggloClustering { | |
public HierAggloClustering(ArrayList<PatstatPatent> patents, ArrayList<ArrayList<Double>> simMatrix, double minStoppingSimilarity){ | |
this.patents = patents; | |
this.simMatrix = simMatrix; | |
this.clusterSimilarities = simMatrix; | |
this.minStoppingSimilarity = minStoppingSimilarity; | |
this.clusters = initializeClusters(); | |
} | |
/** | |
* patents: ArrayList of n patents | |
* simMatrix: ArrayList of nxn doubles | |
* clusters: Initially must be initialized to n, one for each patent | |
*/ | |
public ArrayList<PatstatPatent> patents = new ArrayList<PatstatPatent>(); | |
public ArrayList<ArrayList<Double>> simMatrix = new ArrayList<ArrayList<Double>>(); | |
public ArrayList<PatentCluster> clusters = new ArrayList<PatentCluster>(); | |
public ArrayList<ArrayList<Double>> clusterSimilarities = new ArrayList<ArrayList<Double>>(); | |
public double minStoppingSimilarity; | |
public double maxSimilarity; | |
/** | |
* Initializes clusters for Hierarchical Agglomerative Clustering | |
* In Agglomerative Clustering, initially, all items are clusters of their own | |
* @return initial clusters | |
*/ | |
public ArrayList<PatentCluster> initializeClusters(){ | |
ArrayList<PatentCluster> initialClusters = new ArrayList<PatentCluster>(); | |
for(int i=0; i<patents.size(); i++){ | |
PatentCluster p = new PatentCluster(patents.get(i),simMatrix.get(i)); | |
initialClusters.add(p); | |
} | |
System.out.println("HIERARCHICAL CLUSTERING: Clusters initialized"); | |
return initialClusters; | |
} | |
/** | |
* Main clustering subroutine : HERO FUNCTION | |
*/ | |
public void cluster(){ | |
maxSimilarity = findMaxSimilarity().extremeSimilarity; | |
while(maxSimilarity > minStoppingSimilarity){ | |
int highIndex; | |
int lowIndex; | |
int indexI; | |
int indexJ; | |
ReturnExtremeSimilarity res = findMaxSimilarity(); | |
indexI = res.i; | |
indexJ = res.j; | |
maxSimilarity = res.extremeSimilarity; | |
/** | |
* THIS MEANS THAT CLUSTER INDEX indexI and indexJ should be merged | |
* We will merge into the cluster with lower index | |
* We will set cluster with higher index to null | |
*/ | |
if(indexI > indexJ){ | |
highIndex = indexI; | |
lowIndex = indexJ; | |
} | |
else{ | |
highIndex = indexJ; | |
lowIndex = indexI; | |
} | |
clusters.get(lowIndex).addtoCluster(patents.get(highIndex),simMatrix.get(highIndex)); | |
clusters.set(highIndex,null); | |
removeSimEntriesforAgglomeratedCluster(this.clusterSimilarities,highIndex); | |
//update cluster similarities according to new clusters | |
updateClusterSimilarities(this.clusterSimilarities,lowIndex); | |
System.out.println("Clusters " + Integer.toString(lowIndex+1) + " and " + Integer.toString(highIndex+1) + " merged!"); | |
maxSimilarity = findMaxSimilarity().extremeSimilarity; | |
} | |
} | |
public ArrayList<PatentCluster> getClusters(){ | |
return this.clusters; | |
} | |
public void updateClusterSimilarities(ArrayList<ArrayList<Double>> clusterSimilarities, int lowIndex){ | |
int indexI = lowIndex; | |
for(int indexJ = 0; indexJ < clusterSimilarities.size(); indexJ++){ | |
double maxMemSim = 0.0; | |
if(clusterSimilarities.get(indexI).get(indexJ) == -1.0) continue; | |
else if(indexI == indexJ) continue; | |
else { | |
ArrayList<ArrayList<Double>> clusterMemSims = clusters.get(indexI).getMemberSimilarities(); | |
for (int i=0; i<clusterMemSims.size(); i++){ | |
if(clusterMemSims.get(i).get(indexJ) > maxMemSim){ | |
maxMemSim = clusterMemSims.get(i).get(indexJ); | |
} | |
} | |
} | |
clusterSimilarities.get(indexI).set(indexJ,maxMemSim); | |
clusterSimilarities.get(indexJ).set(indexI,maxMemSim); | |
} | |
} | |
public void removeSimEntriesforAgglomeratedCluster(ArrayList<ArrayList<Double>> clusterSims, int index){ | |
for(int i=0; i< clusterSims.size(); i++){ | |
int j = index; | |
if(i==j) continue; | |
else{ | |
clusterSims.get(i).set(j,-1.0); | |
clusterSims.get(j).set(i,-1.0); | |
} | |
} | |
} | |
/** | |
* finds patent pairs with the maximum similarity | |
* @return class ReturnExtremeSimilarity variable with maximum similarity | |
*/ | |
public ReturnExtremeSimilarity findMaxSimilarity(){ | |
double max = 0.0; | |
int maxI = 0; | |
int maxJ = 0; | |
for (int i=0; i<clusterSimilarities.size(); i++){ | |
for (int j=0; j<=i; j++){ | |
if(i==j) continue; | |
else{ | |
double temp = clusterSimilarities.get(i).get(j); | |
if(temp > max){ | |
max = temp; | |
maxI = i; | |
maxJ = j; | |
} | |
} | |
} | |
} | |
ReturnExtremeSimilarity returnExtremeSimilarity = new ReturnExtremeSimilarity(max,maxI,maxJ); | |
return returnExtremeSimilarity; | |
} | |
/** | |
* finds patent pairs with the minimum similarity | |
* @return class ReturnExtremeSimilarity variable with minimum similarity | |
*/ | |
public ReturnExtremeSimilarity findMinSimilarity(){ | |
double min = 99.0; | |
int minI = 0; | |
int minJ = 0; | |
for (int i=0; i<clusterSimilarities.size(); i++){ | |
for (int j=0; j<=i; j++){ | |
if(i==j) continue; | |
else{ | |
double temp = clusterSimilarities.get(i).get(j); | |
if(temp < min){ | |
min = temp; | |
minI = i; | |
minJ = j; | |
} | |
} | |
} | |
} | |
ReturnExtremeSimilarity returnExtremeSimilarity = new ReturnExtremeSimilarity(min,minI,minJ); | |
return returnExtremeSimilarity; | |
} | |
public class ReturnExtremeSimilarity { | |
public ReturnExtremeSimilarity(double extremeSimilarity, int i, int j){ | |
this.extremeSimilarity = extremeSimilarity; | |
this.i = i; | |
this.j = j; | |
} | |
public double extremeSimilarity; | |
public int i; | |
public int j; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment