ogrisel/bench_pca.py

## bench_pca.py
# %%
from sklearn import set_config
from sklearn.base import clone

set_config(array_api_dispatch=True)

# %%
try:
    import torch
except ModuleNotFoundError:
    torch = None

# %%
try:
    import cupy
except ModuleNotFoundError:
    cupy = None

# %%
try:
    import cuml
except ModuleNotFoundError:
    cuml = None

# %%
from threadpoolctl import threadpool_info
from pprint import pprint

pprint(threadpool_info())

# %%
import numpy as np
import joblib

m = joblib.Memory(location="./joblib")


@m.cache
def make_data(n_samples=int(5e5), n_features=1000, rank=30, dtype=np.float32, seed=0):
    rng = np.random.default_rng(0)
    s = np.diag(np.linspace(100, 0, rank)).astype(dtype)
    A = rng.standard_normal(size=(n_samples, rank)).astype(dtype)
    B = rng.standard_normal(size=(rank, n_features)).astype(dtype)
    return A @ s @ B


X_np = make_data()
print(f"Data shape {X_np.shape} and dtype {X_np.dtype} of size {X_np.nbytes / 1e6:.1f} MB")

# %%
from threadpoolctl import threadpool_limits
from sklearn.decomposition import PCA
from time import perf_counter

k = 5
svd_solver = "full"
if svd_solver == "randomized":
    pca_params = dict(
        n_components=k,
        svd_solver="randomized",
        power_iteration_normalizer="QR",
        random_state=0,
    )
else:
    pca_params = dict(
        n_components=k,
        svd_solver="full",
    )


pca_np = PCA(**pca_params)
print("Running benchmarks for:")
print(pca_np)

tic = perf_counter()
pca_np.fit(X_np)
toc = perf_counter()
print(
    f"Fitting PCA(n_components={k}) with numpy took {toc - tic:.3f}s"
)
# print(pca_np.explained_variance_ratio_)

# for n_threads in [1, 4]:
#     with threadpool_limits(limits=n_threads):
#         tic = perf_counter()
#         pca_np.fit(X_np)
#         toc = perf_counter()
#         print(
#             f"Fitting PCA(n_components={k}) with numpy and {n_threads=} took {toc - tic:.3f}s"
#         )
        # print(pca_np.explained_variance_ratio_)
# %%

if torch is not None:
    X_torch_cpu = torch.tensor(X_np)
    pca_torch_cpu = clone(pca_np)

    tic = perf_counter()
    pca_torch_cpu.fit(X_torch_cpu)
    toc = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with torch on CPU took {toc - tic:.3f}s"
    )
    # print(pca_torch_cpu.explained_variance_ratio_)

if torch is not None and torch.cuda.is_available():
    X_torch_gpu = X_torch_cpu.to("cuda")
    pca_torch_gpu = clone(pca_np)
    tic = perf_counter()
    pca_torch_gpu.fit(X_torch_gpu)
    torch.cuda.synchronize()
    toc = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with torch on GPU took {toc - tic:.3f}s"
    )
    # print(pca_torch_gpu.explained_variance_ratio_.cpu())

# %%
if cupy is not None:
    X_cupy = cupy.asarray(X_np)
    pca_cupy = clone(pca_np)
    tic = perf_counter()
    pca_cupy.fit(X_cupy)
    cupy.cuda.stream.get_current_stream().synchronize()
    toc = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with cupy on GPU took {toc - tic:.3f}s"
    )
    # print(pca_cupy.explained_variance_ratio_)

# %%
if svd_solver == "full" and cuml is not None and cupy is not None:
    from cuml.decomposition import PCA as PCA_cuml
    # cuML's PCA does not support randomized SVD
    pca_cuml = PCA_cuml(**pca_params)
    tic = perf_counter()
    pca_cuml.fit(X_cupy)
    cupy.cuda.stream.get_current_stream().synchronize()
    toc = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with cupy with cuML on GPU took {toc - tic:.3f}s"
    )
    # print(pca_cuml.explained_variance_ratio_)
	# %%
	from sklearn import set_config
	from sklearn.base import clone

	set_config(array_api_dispatch=True)

	# %%
	try:
	import torch
	except ModuleNotFoundError:
	torch = None

	# %%
	try:
	import cupy
	except ModuleNotFoundError:
	cupy = None

	# %%
	try:
	import cuml
	except ModuleNotFoundError:
	cuml = None

	# %%
	from threadpoolctl import threadpool_info
	from pprint import pprint

	pprint(threadpool_info())

	# %%
	import numpy as np
	import joblib

	m = joblib.Memory(location="./joblib")


	@m.cache
	def make_data(n_samples=int(5e5), n_features=1000, rank=30, dtype=np.float32, seed=0):
	rng = np.random.default_rng(0)
	s = np.diag(np.linspace(100, 0, rank)).astype(dtype)
	A = rng.standard_normal(size=(n_samples, rank)).astype(dtype)
	B = rng.standard_normal(size=(rank, n_features)).astype(dtype)
	return A @ s @ B


	X_np = make_data()
	print(f"Data shape {X_np.shape} and dtype {X_np.dtype} of size {X_np.nbytes / 1e6:.1f} MB")

	# %%
	from threadpoolctl import threadpool_limits
	from sklearn.decomposition import PCA
	from time import perf_counter

	k = 5
	svd_solver = "full"
	if svd_solver == "randomized":
	pca_params = dict(
	n_components=k,
	svd_solver="randomized",
	power_iteration_normalizer="QR",
	random_state=0,
	)
	else:
	pca_params = dict(
	n_components=k,
	svd_solver="full",
	)


	pca_np = PCA(**pca_params)
	print("Running benchmarks for:")
	print(pca_np)

	tic = perf_counter()
	pca_np.fit(X_np)
	toc = perf_counter()
	print(
	f"Fitting PCA(n_components={k}) with numpy took {toc - tic:.3f}s"
	)
	# print(pca_np.explained_variance_ratio_)

	# for n_threads in [1, 4]:
	# with threadpool_limits(limits=n_threads):
	# tic = perf_counter()
	# pca_np.fit(X_np)
	# toc = perf_counter()
	# print(
	# f"Fitting PCA(n_components={k}) with numpy and {n_threads=} took {toc - tic:.3f}s"
	# )
	# print(pca_np.explained_variance_ratio_)
	# %%

	if torch is not None:
	X_torch_cpu = torch.tensor(X_np)
	pca_torch_cpu = clone(pca_np)

	tic = perf_counter()
	pca_torch_cpu.fit(X_torch_cpu)
	toc = perf_counter()
	print(
	f"Fitting PCA(n_components={k}) with torch on CPU took {toc - tic:.3f}s"
	)
	# print(pca_torch_cpu.explained_variance_ratio_)

	if torch is not None and torch.cuda.is_available():
	X_torch_gpu = X_torch_cpu.to("cuda")
	pca_torch_gpu = clone(pca_np)
	tic = perf_counter()
	pca_torch_gpu.fit(X_torch_gpu)
	torch.cuda.synchronize()
	toc = perf_counter()
	print(
	f"Fitting PCA(n_components={k}) with torch on GPU took {toc - tic:.3f}s"
	)
	# print(pca_torch_gpu.explained_variance_ratio_.cpu())

	# %%
	if cupy is not None:
	X_cupy = cupy.asarray(X_np)
	pca_cupy = clone(pca_np)
	tic = perf_counter()
	pca_cupy.fit(X_cupy)
	cupy.cuda.stream.get_current_stream().synchronize()
	toc = perf_counter()
	print(
	f"Fitting PCA(n_components={k}) with cupy on GPU took {toc - tic:.3f}s"
	)
	# print(pca_cupy.explained_variance_ratio_)

	# %%
	if svd_solver == "full" and cuml is not None and cupy is not None:
	from cuml.decomposition import PCA as PCA_cuml
	# cuML's PCA does not support randomized SVD
	pca_cuml = PCA_cuml(**pca_params)
	tic = perf_counter()
	pca_cuml.fit(X_cupy)
	cupy.cuda.stream.get_current_stream().synchronize()
	toc = perf_counter()
	print(
	f"Fitting PCA(n_components={k}) with cupy with cuML on GPU took {toc - tic:.3f}s"
	)
	# print(pca_cuml.explained_variance_ratio_)