George GeorgeSeif

## faiss_cluster_gpu.py
# Benchmark FAISS k-means training with gpu=True.
# This snippet relies on `faiss`, `np`, `time` and the `data` array already
# being in scope; it does not import or define them itself.

# Start the timer here: the elapsed time printed below is computed as e - s,
# so `s` must be taken right before this run begins.
s = time.time()

k = 10          # forwarded to faiss.Kmeans as k
n_init = 10     # forwarded to faiss.Kmeans as nredo
max_iter = 300  # forwarded to faiss.Kmeans as niter
kmeans = faiss.Kmeans(d=data.shape[1], k=k, niter=max_iter, nredo=n_init, gpu=True)
# The data is converted to float32 before being handed to train().
kmeans.train(data.astype(np.float32))

e = time.time()
print("Training time = {}".format(e - s))


## sklearn_cluster_benchmark_2.py
# Benchmark scikit-learn KMeans on `data`: time fit(), then time predict().
# Relies on `KMeans`, `time` and `data` already being in scope.
clf = KMeans(n_clusters=10)

# Time the training step.
s = time.time()
clf.fit(data)
e = time.time()
print("Training time = {}".format(e - s))


# Time the prediction step. The timer is stopped and reported here;
# without this the second `s` would be set but never read.
s = time.time()
clf.predict(data)
e = time.time()
print("Prediction time = {}".format(e - s))

## generate_cluster_data.py
import numpy as np

# Synthetic data set: `data_size` rows of 3 values, each column drawn from a
# normal distribution with mean 100 and standard deviation 20.
data_size = 1000

data = np.random.normal(
    loc=(100, 100, 100),
    scale=(20, 20, 20),
    size=(data_size, 3),
)

## faiss_cluster_cpu.py
# Time FAISS k-means training (no gpu flag is passed to faiss.Kmeans here).
# Relies on `faiss`, `np`, `time` and `x_train` already being in scope.
s = time.time()

k, n_init, max_iter = 10, 10, 300
kmeans = faiss.Kmeans(
    d=x_train.shape[1],
    k=k,
    niter=max_iter,
    nredo=n_init,
)
# train() receives the training data converted to float32.
kmeans.train(x_train.astype(np.float32))

e = time.time()
print("Training time = {}".format(e - s))

## sklearn_cluster_benchmark.py
# Benchmark scikit-learn KMeans: time fit() on x_train, then predict() on x_test.
# Relies on `KMeans`, `time`, `x_train` and `x_test` already being in scope.
clf = KMeans(n_clusters=10)

# Time the training step.
s = time.time()
clf.fit(x_train)
e = time.time()
print("Training time = {}".format(e - s))


# Time the prediction step. The timer is stopped and reported here;
# without this the second `s` would be set but never read.
s = time.time()
clf.predict(x_test)
e = time.time()
print("Prediction time = {}".format(e - s))

## setup_sklearn_cluster.py
import numpy as np
import time

from sklearn.cluster import KMeans
from keras.datasets import mnist

# Load MNIST as (train, test) pairs of (inputs, labels).
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Flatten each sample into a single row, convert to float and divide by 255.
x_train = x_train.reshape(x_train.shape[0], -1).astype(float) / 255.
x_test = x_test.reshape(x_test.shape[0], -1).astype(float) / 255.

## install_faiss_2
# CPU version only
# Installs the faiss-cpu package from the "pytorch" conda channel.
conda install faiss-cpu -c pytorch

# Additional Python packages
# These are installed with pip3, one package per command.
pip3 install numpy
pip3 install scikit-learn
pip3 install tensorflow
pip3 install keras

## install_faiss
# Create a conda environment named "faiss", then activate it so that
# subsequent install commands target that environment.
conda create --name faiss
conda activate faiss

## scikit_learn_2.py
from sklearn.decomposition import LatentDirichletAllocation as LDA

NUM_TOPICS = 3

# Build the LDA model and fit it in one expression (fit() returns the
# estimator itself, so `lda` is the fitted model).
# "document_word_matrix" is a 2D array: one row per document, one column
# per word, and each cell holds how often that word occurs in that document.
lda = LDA(n_components=NUM_TOPICS, n_jobs=-1).fit(document_word_matrix)

## scikit_learn_1.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf(vectorizer, doc_vectors=None):
  """Return the vectorized documents as a DataFrame, one column per feature.

  Args:
    vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or,
      on older scikit-learn releases, ``get_feature_names()``.
    doc_vectors: Matrix with a ``todense()`` method holding the values to
      tabulate. When omitted, the module-level ``vectors`` is used, so
      existing one-argument calls keep working.

  Returns:
    A ``pandas.DataFrame`` with one row per matrix row and columns named
    after the vectorizer's features.
  """
  if doc_vectors is None:
    # Backward-compatible fallback to the global this function used to read.
    doc_vectors = vectors
  # get_feature_names() is gone in recent scikit-learn releases; prefer the
  # replacement and fall back only when it is not available.
  names_getter = getattr(vectorizer, "get_feature_names_out", None)
  if names_getter is None:
    names_getter = vectorizer.get_feature_names
  feature_names = list(names_getter())
  dense_list = doc_vectors.todense().tolist()
  return pd.DataFrame(dense_list, columns=feature_names)
	# Benchmark FAISS k-means training with gpu=True.
	# Relies on `faiss`, `np`, `time` and `data` already being in scope.

	# Start the timer here: the elapsed time printed below is e - s, so `s`
	# must be taken right before this run begins.
	s = time.time()

	k = 10          # forwarded to faiss.Kmeans as k
	n_init = 10     # forwarded to faiss.Kmeans as nredo
	max_iter = 300  # forwarded to faiss.Kmeans as niter
	kmeans = faiss.Kmeans(d=data.shape[1], k=k, niter=max_iter, nredo=n_init, gpu=True)
	# The data is converted to float32 before being handed to train().
	kmeans.train(data.astype(np.float32))

	e = time.time()
	print("Training time = {}".format(e - s))
	# Benchmark scikit-learn KMeans on `data`: time fit(), then time predict().
	# Relies on `KMeans`, `time` and `data` already being in scope.
	clf = KMeans(n_clusters=10)

	# Time the training step.
	s = time.time()
	clf.fit(data)
	e = time.time()
	print("Training time = {}".format(e - s))


	# Time the prediction step. The timer is stopped and reported here;
	# without this the second `s` would be set but never read.
	s = time.time()
	clf.predict(data)
	e = time.time()
	print("Prediction time = {}".format(e - s))
	import numpy as np

	# Synthetic data set: `data_size` rows of 3 values, each column drawn from
	# a normal distribution with mean 100 and standard deviation 20.
	data_size = 1000

	data = np.random.normal(
		loc=(100, 100, 100),
		scale=(20, 20, 20),
		size=(data_size, 3),
	)
	# Benchmark scikit-learn KMeans: time fit() on x_train, then predict() on x_test.
	# Relies on `KMeans`, `time`, `x_train` and `x_test` already being in scope.
	clf = KMeans(n_clusters=10)

	# Time the training step.
	s = time.time()
	clf.fit(x_train)
	e = time.time()
	print("Training time = {}".format(e - s))


	# Time the prediction step. The timer is stopped and reported here;
	# without this the second `s` would be set but never read.
	s = time.time()
	clf.predict(x_test)
	e = time.time()
	print("Prediction time = {}".format(e - s))
	import numpy as np
	import time

	from sklearn.cluster import KMeans
	from keras.datasets import mnist

	# Load MNIST as (train, test) pairs of (inputs, labels).
	train_split, test_split = mnist.load_data()
	x_train, y_train = train_split
	x_test, y_test = test_split

	# Flatten each sample into a single row, convert to float and divide by 255.
	x_train = x_train.reshape(x_train.shape[0], -1).astype(float) / 255.
	x_test = x_test.reshape(x_test.shape[0], -1).astype(float) / 255.
	# CPU version only
	# Installs the faiss-cpu package from the "pytorch" conda channel.
	conda install faiss-cpu -c pytorch

	# Additional Python packages
	# These are installed with pip3, one package per command.
	pip3 install numpy
	pip3 install scikit-learn
	pip3 install tensorflow
	pip3 install keras
	from sklearn.decomposition import LatentDirichletAllocation as LDA

	NUM_TOPICS = 3

	# Build the LDA model and fit it in one expression (fit() returns the
	# estimator itself, so `lda` is the fitted model).
	# "document_word_matrix" is a 2D array: one row per document, one column
	# per word, and each cell holds how often that word occurs in that document.
	lda = LDA(n_components=NUM_TOPICS, n_jobs=-1).fit(document_word_matrix)
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer

	def get_tf_idf(vectorizer, doc_vectors=None):
		"""Return the vectorized documents as a DataFrame, one column per feature.

		Args:
			vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
				or, on older scikit-learn releases, ``get_feature_names()``.
			doc_vectors: Matrix with a ``todense()`` method holding the values
				to tabulate. When omitted, the enclosing scope's ``vectors``
				is used, so existing one-argument calls keep working.

		Returns:
			A ``pandas.DataFrame`` with one row per matrix row and columns
			named after the vectorizer's features.
		"""
		if doc_vectors is None:
			# Backward-compatible fallback to the name this function used to read.
			doc_vectors = vectors
		# get_feature_names() is gone in recent scikit-learn releases; prefer
		# the replacement and fall back only when it is not available.
		names_getter = getattr(vectorizer, "get_feature_names_out", None)
		if names_getter is None:
			names_getter = vectorizer.get_feature_names
		feature_names = list(names_getter())
		dense_list = doc_vectors.todense().tolist()
		return pd.DataFrame(dense_list, columns=feature_names)