Last active
February 12, 2022 18:08
-
-
Save tsalo/2f3e28b1608c53e9f171d008f545b447 to your computer and use it in GitHub Desktop.
An ongoing effort to profile certain steps in NiMARE, including kernel transformation and meta-analysis estimation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A small job to profile kernel transformers and meta estimators in NiMARE.""" | |
import logging | |
import os.path as op | |
import psutil | |
from datetime import datetime | |
import numpy as np | |
import pandas as pd | |
from memory_profiler import memory_usage | |
from nimare.dataset import Dataset | |
from nimare.meta import MKDADensity | |
from nimare.meta.kernel import MKDAKernel | |
# Turn every NiMARE logger up to DEBUG so the profiled steps emit detail.
loggers = [
    logging.getLogger(name)
    for name in logging.root.manager.loggerDict
    if "nimare" in name
]
for logger in loggers:
    logger.setLevel(logging.DEBUG)

LGR = logging.getLogger("script")

# Used at the end of the script to look for files left open by this user's
# python processes.
USERNAME = "tsalo006"
proc = psutil.Process()

# Load the full Neurosynth dataset and point its image paths at local data.
full_dset = Dataset.load("/home/tsalo006/neurosynth_dataset.pkl.gz")
full_dset.update_path(op.abspath("ns_data"))
total_study_count = len(full_dset.ids)
# Results table: one row per (subset size, iteration). For each profiled
# condition we record the peak memory, trough memory, and wall-clock duration.
_conditions = [
    "MKDAKernel+Dataset",
    "MKDAKernel+array",
    "MKDADensity",
    "MKDADensity+PregeneratedMA",
    "MKDADensity+MemoryLimit",
    "MKDADensity+PregeneratedMA+MemoryLimit",
]
_metrics = ["max memory", "min memory", "duration"]
df = pd.DataFrame(
    columns=["n_studies", "iteration"]
    + [f"{cond} {metric}" for cond in _conditions for metric in _metrics]
)
row_counter = 0  # next row index to fill in df
skip_pregen = False  # flips to True once the pre-generated-MA run gets too big
def _profile_call(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` under memory_profiler and time it.

    Parameters
    ----------
    func : callable
        The function to profile.
    *args, **kwargs
        Arguments forwarded to ``func``.

    Returns
    -------
    tuple
        ``(max_mem, min_mem, duration)`` — peak and trough memory as sampled
        by memory_profiler (MiB) and wall-clock seconds.
    """
    start = datetime.now()
    mem = memory_usage((func, args, kwargs))
    duration = (datetime.now() - start).total_seconds()
    return max(mem), min(mem), duration


def _record(row, label, max_mem, min_mem, duration):
    """Store one profiling-result triple in the results DataFrame."""
    df.loc[row, f"{label} max memory"] = max_mem
    df.loc[row, f"{label} min memory"] = min_mem
    df.loc[row, f"{label} duration"] = duration


# Profile each condition across increasing subset sizes, five iterations each.
for subset_count in np.arange(100, total_study_count // 2, 10):
    print(f"Running {subset_count}")
    for j_iter in range(5):
        print("\t" + f"Iteration {j_iter}")
        df.loc[row_counter, "n_studies"] = subset_count
        df.loc[row_counter, "iteration"] = j_iter

        # Subset the Dataset. Seeding with the iteration index makes each
        # iteration's sample different but reproducible across subset sizes.
        selected_ids = np.random.RandomState(seed=j_iter).choice(
            full_dset.ids, size=subset_count, replace=False
        )
        red_dset = full_dset.slice(selected_ids)

        # Make MA maps and return array
        print("\t\tMake MA maps and return array", flush=True)
        LGR.info("Make MA maps and return array.")
        kern = MKDAKernel()
        _record(
            row_counter,
            "MKDAKernel+array",
            *_profile_call(kern.transform, red_dset, return_type="array"),
        )
        del kern

        # Make MA maps and return Dataset
        print("\t\tMake MA maps and return Dataset", flush=True)
        LGR.info("Make MA maps and return Dataset.")
        kern = MKDAKernel()
        _record(
            row_counter,
            "MKDAKernel+Dataset",
            *_profile_call(kern.transform, red_dset, return_type="dataset"),
        )
        del kern

        # Standard meta-analysis without pre-generated MA maps
        print("\t\tStandard meta-analysis without pre-generated MA maps", flush=True)
        LGR.info("Standard meta-analysis without pre-generated MA maps.")
        meta = MKDADensity()
        _record(row_counter, "MKDADensity", *_profile_call(meta.fit, red_dset))
        del meta

        # Without pre-generated MA maps, but with a memory limit of 500MB
        print("\t\tWithout pre-generated MA maps, but with memory limit", flush=True)
        LGR.info("Without pre-generated MA maps, but with memory limit.")
        meta = MKDADensity(memory_limit="500mb")
        _record(
            row_counter, "MKDADensity+MemoryLimit", *_profile_call(meta.fit, red_dset)
        )
        del meta

        # Now actually make the updated Dataset (with pre-generated MA maps)
        # to be used by the remaining conditions.
        kern = MKDAKernel()
        red_dset_ma = kern.transform(red_dset, return_type="dataset")

        # With generated MA maps
        if not skip_pregen:
            print("\t\tWith generated MA maps.", flush=True)
            LGR.info("With generated MA maps.")
            meta = MKDADensity()
            max_mem, min_mem, duration = _profile_call(meta.fit, red_dset_ma)
            _record(
                row_counter, "MKDADensity+PregeneratedMA", max_mem, min_mem, duration
            )
            # Skip this condition for the rest of the job once it exceeds
            # ~20GB (memory_usage reports MiB), so the job itself survives.
            if max_mem > 20000:
                skip_pregen = True
            del meta

        # With generated MA maps and a memory limit of 500MB
        print("\t\tWith generated MA maps and memory limit", flush=True)
        LGR.info("With generated MA maps and memory limit.")
        meta = MKDADensity(memory_limit="500mb")
        _record(
            row_counter,
            "MKDADensity+PregeneratedMA+MemoryLimit",
            *_profile_call(meta.fit, red_dset_ma),
        )
        del meta

        row_counter += 1
        # Checkpoint results after every iteration so a crash keeps partials.
        df.to_csv("results.tsv", sep="\t", index=False)
# Report files still held open by this user's python processes, to check for
# file-handle leaks after the profiling runs. psutil queries can raise
# NoSuchProcess (process exited between enumeration and inspection) or
# AccessDenied, so each per-process call is guarded.
my_procs = []
for candidate in psutil.process_iter():
    try:
        if candidate.username() == USERNAME and "python" in candidate.name():
            my_procs.append(candidate)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        continue

open_files = []
for proc in my_procs:
    try:
        my_files = proc.open_files()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        continue
    # Ignore the job's own log files and system files like /etc/passwd.
    my_files = [
        f
        for f in my_files
        if not f.path.endswith((".err", ".out")) and "passwd" not in f.path
    ]
    open_files += my_files

open_files = [str(of) for of in open_files]
if len(open_files):
    open_files_str = "\n" + "\n".join(open_files)
    print(f"Open files: {open_files_str}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment