Created
February 14, 2022 03:50
-
-
Save Enoch2090/5d1d3437ed749aa9ece5ffa692e78bd3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def cluster(diff: list, maxiter: int = 200, seed: int = 42,
            epsilon: "float | None" = None) -> "tuple[float, list]":
    '''
    Run a two-cluster, one-dimensional K-means on the list diff.
    Assumes the list can be divided by a clear threshold, and
    returns that threshold as the mean of the two cluster centers.

    Arguments:
        - diff (list): The list of hash differences (at least two values).
        - maxiter (int): Max number of K-means iterations.
        - seed (int): Seed for the private RNG that picks the initial centers.
        - epsilon (float | None): Step size of the centroid update;
          1.0 is the plain K-means update, smaller values move each
          centroid only part of the way towards its cluster mean.
          When None, a module-level ``epsilon`` is used if one exists,
          otherwise 1.0.
    Returns:
        float - The calculated threshold
        list - The two centroids (as floats)
    Raises:
        ValueError - If diff is not one-dimensional or has fewer than
        two values.
    '''
    # Work on a private float copy: the caller's data is never reordered
    # and centroid means are not truncated when the input is integer.
    points = np.array(diff, dtype=float)
    if points.ndim != 1 or points.shape[0] < 2:
        raise ValueError(
            "cluster() needs a one-dimensional sequence with at least two values"
        )

    if epsilon is None:
        # Earlier revisions read a module-level `epsilon`; honour it if set.
        epsilon = globals().get("epsilon", 1.0)

    # A local RandomState yields the same shuffle as seeding the global
    # NumPy RNG, without clobbering the global RNG state for the caller.
    rng = np.random.RandomState(seed)
    rng.shuffle(points)
    # Copy, so that updating a centroid does not overwrite points[0] / points[1].
    centroids = points[:2].copy()

    distances = np.empty((points.shape[0], 2))
    for _ in range(maxiter):
        previous = centroids.copy()
        distances[:, 0] = np.abs(points - centroids[0])
        distances[:, 1] = np.abs(points - centroids[1])
        classes = np.argmin(distances, axis=1)
        for k in (0, 1):
            members = points[classes == k]
            # An empty cluster keeps its centroid instead of becoming NaN.
            if members.size:
                centroids[k] = (epsilon * members.mean()
                                + (1 - epsilon) * centroids[k])
        # Once the centroids stop moving, further iterations change nothing.
        if np.array_equal(centroids, previous):
            break
    return (np.mean(centroids), centroids.tolist())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment