Created
August 26, 2019 17:35
-
-
Save jackbergus/0e944b1b62155a655cc4d97df9b09d6c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* HierarchicalDistance.java | |
* This file is part of HierarchicalDistance | |
* | |
* Copyright (C) 2019 - Giacomo Bergami | |
* | |
* HierarchicalDistance is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 2 of the License, or | |
* (at your option) any later version. | |
* | |
* HierarchicalDistance is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with HierarchicalDistance. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
package it.giacomobergami.jingoloba.poc; | |
import java.util.*; | |
/** | |
* Given two vectors representing a path into a hierarchical tree, the class defines a similarity | |
* score `distance` determining the similarity of the two terms. For each vector, a threshold value | |
* `thresholdValue`: if the distance is greater than the threshold, then the two vectors are unrelated. | |
* | |
* The int[] hierarchical vectors should be represented accordingly to the ones in the following paper: | |
* - A. Petermann, G. Micale, G. Bergami, A. Pulvirenti, E. Rahm. “Mining and Ranking of Generalized Multi-Dimensional Frequent Subgraphs”. Proceedings of the Twelfth International Conference on Digital Information Management, Fukuoka, Japan, September 2017 (ICDIM 2017). | |
*/ | |
public class HierarchicalDistance { | |
/** | |
* Given a hierarchical tree extracted from a DAG, its maximum assocated branching factor | |
*/ | |
private double maximumBranchingFactor; | |
/** | |
* Maximum distance represented by the distance between the root (empty vector) and one of the nodes at the fist level | |
*/ | |
private double distanceFactor; | |
/** | |
* Factor reducing the distance between the one node from the i-th level to the children at the (i+1)-level greater | |
* at each level step, such that the nodes from different branches will not be overlapped. | |
*/ | |
private double decayFactor; | |
public HierarchicalDistance(int maximumBranchingFactor, double distanceFactor, int decayFactor) { | |
this.maximumBranchingFactor = maximumBranchingFactor; | |
this.distanceFactor = distanceFactor; | |
this.decayFactor = decayFactor; | |
} | |
/** | |
* Returns a vector that can be used to compute the euclidean distance from the hierarchical vector | |
* @param hierarchyVector Vector expressed as in the Petermann paper | |
* @return | |
*/ | |
public double[] fromHierarchyVectorToEuclideanVector(int[] hierarchyVector) { | |
double[] d = new double[(int)maximumBranchingFactor]; | |
double currentDecay = 1.0; | |
for (int i = 0, hierarchyVectorLength = hierarchyVector.length; i < hierarchyVectorLength; i++) { | |
int vi = hierarchyVector[i] - 1; | |
d[vi] = d[vi] + (distanceFactor / currentDecay); | |
currentDecay *= decayFactor; | |
} | |
return d; | |
} | |
/** | |
* Distance threshold values for two given elements | |
* @param left | |
* @param right | |
* @return | |
*/ | |
public double thresholdValue(int[] left, int[] right) { | |
int h2 = left.length, h1 = right.length; | |
if (h2 < h1) { | |
int tmp = h2; | |
h2 = h1; | |
h1 = tmp; | |
} | |
return distanceFactor * (Math.pow(decayFactor, h2-h1+1) - 1.0) / (Math.pow(decayFactor, h2) * (decayFactor - 1.0)); | |
} | |
/** | |
* @param left Hierarchy vector | |
* @param right Hierarchy vector | |
* @return Distance | |
*/ | |
public double distance(int[] left, int[] right) { | |
return euclideanDistance(fromHierarchyVectorToEuclideanVector(left), fromHierarchyVectorToEuclideanVector(right)); | |
} | |
/** | |
* This distance is defined as follows: isConsistent(left, right) <=> subArrayOf(left,right) || subArrayOf(right,left) | |
* | |
* @param left path index vector | |
* @param right path index vector | |
* @return Whether the two elements are consistent or not. | |
*/ | |
public boolean isConsistent(int[] left, int []right) { | |
return distance(left, right) <= thresholdValue(left, right); | |
} | |
/** | |
* Implements the euclidean distance between two coordinate vectors | |
* @param left Left coordinate vector | |
* @param right Right coordinate vector | |
* @return Distance | |
*/ | |
public double euclideanDistance(double[] left, double[] right) { | |
double sum = 0.0; | |
int M = left.length; | |
int N = right.length; | |
for (int i = 0, n = Integer.max(M, N); i<n; i++) { | |
sum += Math.pow((i<M ? left[i] : 0.0)- (i<N ? right[i] : 0.0), 2); | |
} | |
return Math.sqrt(sum); | |
} | |
public static void generateCompleteSubgraph(List<int[]> finalList, int currentHeight, int maximumBranching, int[] currentArray) { | |
if (currentHeight > 0) { | |
for (int i = 1; i<=maximumBranching; i++) { | |
int[] current = new int[currentArray.length+1]; | |
System.arraycopy(currentArray, 0, current, 0, currentArray.length); | |
current[currentArray.length] = i; | |
finalList.add(current); | |
generateCompleteSubgraph(finalList, currentHeight-1, maximumBranching, current); | |
} | |
} | |
} | |
public static List<int[]> generateCompleteSubgraph(int maximumBranchingFactor, int maximumHeight) { | |
List<int[]> result = new ArrayList<>(); | |
if (maximumBranchingFactor > 0) { | |
for (int i = 1; i<=maximumBranchingFactor; i++) { | |
int[] current = new int[1]; | |
current[0] = i; | |
result.add(current); | |
generateCompleteSubgraph(result, maximumHeight-1, maximumBranchingFactor, current); | |
} | |
} | |
return result; | |
} | |
public static int indexOfSubList(int[] array, int[] subarray) { | |
int sourceSize = array.length; | |
int targetSize = subarray.length; | |
int maxCandidate = sourceSize - targetSize; | |
nextCand: | |
for (int candidate = 0; candidate <= maxCandidate; candidate++) { | |
for (int i=0, j=candidate; i<targetSize; i++, j++) | |
if (!(subarray[i] == array[j])) | |
continue nextCand; // Element mismatch, try next cand | |
return candidate; // All elements of candidate matched target | |
} | |
return -1; // No candidate matched the target | |
} | |
public static boolean subArrayOf(int[] subarray, int[] array) { | |
return indexOfSubList(array, subarray) == 0; | |
} | |
/** | |
* Empirical test | |
* @param args | |
*/ | |
public static void main(String args[]) { | |
// Initializing the concept | |
int maximumBranchingFactor = Integer.valueOf(args[0]); | |
double distanceFactor = Double.valueOf(args[1]); | |
int decayFactor = Integer.valueOf(args[2]); | |
int maximumHeight = Integer.valueOf(args[3]); | |
HierarchicalDistance hd = new HierarchicalDistance(maximumBranchingFactor, distanceFactor, decayFactor); | |
List<int[]> ls = generateCompleteSubgraph(maximumBranchingFactor, maximumHeight); | |
for (int i = 0, n = ls.size(); i<n; i++) { | |
int[] veci = ls.get(i); | |
for (int j = 0; j<n; j++) { | |
int[] vecj = ls.get(j); | |
boolean consistenceInference = hd.isConsistent(veci, vecj); | |
if ((subArrayOf(veci, vecj)) || (subArrayOf(vecj, veci))) { | |
if (!consistenceInference) | |
System.err.println("ERROR! the algorithm says that is not consistent, while it should be"); | |
} else { | |
if (consistenceInference) { | |
System.err.println("ERROR! the algorithm says that is consistent, while it is not"); | |
} | |
} | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment