@tsalo
Last active February 12, 2022 18:08
An ongoing effort to profile certain steps in NiMARE, including kernel transformation and meta-analysis estimation
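Before the script itself, it is worth noting the one pattern the whole job leans on: memory_profiler's memory_usage accepts a (callable, args, kwargs) tuple, runs the callable while periodically sampling the process's memory, and returns a list of readings in MiB. Below is a minimal, self-contained sketch of that pattern; the allocate helper is hypothetical, included only to give the profiler something to measure.

"""Sketch of the memory_usage pattern used throughout the script below."""
from memory_profiler import memory_usage


def allocate(n_mib, chunk_mib=1):
    """Hypothetical workload: hold roughly ``n_mib`` MiB of memory for a moment."""
    n_chunks = n_mib // chunk_mib
    blocks = [b"x" * (chunk_mib * 1024 * 1024) for _ in range(n_chunks)]
    return len(blocks)


# memory_usage runs the callable and samples process memory (in MiB) until it returns.
readings = memory_usage((allocate, (100,), {"chunk_mib": 10}))
print(f"min={min(readings):.1f} MiB, max={max(readings):.1f} MiB")

The difference between the maximum and minimum readings approximates the workload's footprint, which is presumably why the script records both extremes for every condition.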
"""A small job to profile kernel transformers and meta estimators in NiMARE."""
import logging
import os.path as op
import psutil
from datetime import datetime
import numpy as np
import pandas as pd
from memory_profiler import memory_usage
from nimare.dataset import Dataset
from nimare.meta import MKDADensity
from nimare.meta.kernel import MKDAKernel

# Set all NiMARE loggers to DEBUG for more detailed output
loggers = [
    logging.getLogger(name)
    for name in logging.root.manager.loggerDict
    if "nimare" in name
]
for logger in loggers:
    logger.setLevel(logging.DEBUG)

LGR = logging.getLogger("script")

# For tracking open files
USERNAME = "tsalo006"
proc = psutil.Process()

# Load the full Neurosynth Dataset and update its image paths to a local directory
full_dset = Dataset.load("/home/tsalo006/neurosynth_dataset.pkl.gz")
full_dset.update_path(op.abspath("ns_data"))
total_study_count = len(full_dset.ids)

# One row per (subset size, iteration); columns record the peak/minimum memory
# (in MiB) and duration (in seconds) of each profiled condition.
df = pd.DataFrame(
    columns=[
        "n_studies",
        "iteration",
        "MKDAKernel+Dataset max memory",
        "MKDAKernel+Dataset min memory",
        "MKDAKernel+Dataset duration",
        "MKDAKernel+array max memory",
        "MKDAKernel+array min memory",
        "MKDAKernel+array duration",
        "MKDADensity max memory",
        "MKDADensity min memory",
        "MKDADensity duration",
        "MKDADensity+PregeneratedMA max memory",
        "MKDADensity+PregeneratedMA min memory",
        "MKDADensity+PregeneratedMA duration",
        "MKDADensity+MemoryLimit max memory",
        "MKDADensity+MemoryLimit min memory",
        "MKDADensity+MemoryLimit duration",
        "MKDADensity+PregeneratedMA+MemoryLimit max memory",
        "MKDADensity+PregeneratedMA+MemoryLimit min memory",
        "MKDADensity+PregeneratedMA+MemoryLimit duration",
    ]
)
row_counter = 0
skip_pregen = False
for i_subset, subset_count in enumerate(np.arange(100, total_study_count // 2, 10)):
    print(f"Running {subset_count}")
    for j_iter in range(5):
        print(f"\tIteration {j_iter}")
        df.loc[row_counter, "n_studies"] = subset_count
        df.loc[row_counter, "iteration"] = j_iter

        # Subset the Dataset
        selected_ids = np.random.RandomState(seed=j_iter).choice(
            full_dset.ids, size=subset_count, replace=False
        )
        red_dset = full_dset.slice(selected_ids)

        # Make MA maps and return array
        print("\t\tMake MA maps and return array", flush=True)
        LGR.info("Make MA maps and return array.")
        kern = MKDAKernel()
        start = datetime.now()
        mem = memory_usage(
            (
                kern.transform,
                (red_dset,),
                {"return_type": "array"},
            )
        )
        duration = (datetime.now() - start).total_seconds()
        df.loc[row_counter, "MKDAKernel+array max memory"] = max(mem)
        df.loc[row_counter, "MKDAKernel+array min memory"] = min(mem)
        df.loc[row_counter, "MKDAKernel+array duration"] = duration
        del start, mem, duration
        del kern

        # Make MA maps and return Dataset
        print("\t\tMake MA maps and return Dataset", flush=True)
        LGR.info("Make MA maps and return Dataset.")
        kern = MKDAKernel()
        start = datetime.now()
        mem = memory_usage(
            (
                kern.transform,
                (red_dset,),
                {"return_type": "dataset"},
            )
        )
        duration = (datetime.now() - start).total_seconds()
        df.loc[row_counter, "MKDAKernel+Dataset max memory"] = max(mem)
        df.loc[row_counter, "MKDAKernel+Dataset min memory"] = min(mem)
        df.loc[row_counter, "MKDAKernel+Dataset duration"] = duration
        del start, mem, duration
        del kern

        # Standard meta-analysis without pre-generated MA maps
        print("\t\tStandard meta-analysis without pre-generated MA maps", flush=True)
        LGR.info("Standard meta-analysis without pre-generated MA maps.")
        meta = MKDADensity()
        start = datetime.now()
        mem = memory_usage((meta.fit, (red_dset,)))
        duration = (datetime.now() - start).total_seconds()
        df.loc[row_counter, "MKDADensity max memory"] = max(mem)
        df.loc[row_counter, "MKDADensity min memory"] = min(mem)
        df.loc[row_counter, "MKDADensity duration"] = duration
        del start, mem, duration
        del meta

        # Without pre-generated MA maps, but with a memory limit of 500MB
        print("\t\tWithout pre-generated MA maps, but with memory limit", flush=True)
        LGR.info("Without pre-generated MA maps, but with memory limit.")
        meta = MKDADensity(memory_limit="500mb")
        start = datetime.now()
        mem = memory_usage((meta.fit, (red_dset,)))
        duration = (datetime.now() - start).total_seconds()
        df.loc[row_counter, "MKDADensity+MemoryLimit max memory"] = max(mem)
        df.loc[row_counter, "MKDADensity+MemoryLimit min memory"] = min(mem)
        df.loc[row_counter, "MKDADensity+MemoryLimit duration"] = duration
        del start, mem, duration
        del meta
        # Now actually create the Dataset with pre-generated MA maps
        kern = MKDAKernel()
        red_dset_ma = kern.transform(red_dset, return_type="dataset")

        # With pre-generated MA maps
        if not skip_pregen:
            print("\t\tWith pre-generated MA maps", flush=True)
            LGR.info("With pre-generated MA maps.")
            meta = MKDADensity()
            start = datetime.now()
            mem = memory_usage((meta.fit, (red_dset_ma,)))
            duration = (datetime.now() - start).total_seconds()
            df.loc[row_counter, "MKDADensity+PregeneratedMA max memory"] = max(mem)
            df.loc[row_counter, "MKDADensity+PregeneratedMA min memory"] = min(mem)
            df.loc[row_counter, "MKDADensity+PregeneratedMA duration"] = duration

            # A small check for excessive memory usage: once this condition
            # exceeds 20,000 MiB (roughly 20 GB), skip it in all later runs.
            if max(mem) > 20000:
                skip_pregen = True

            del start, mem, duration
            del meta
        # With pre-generated MA maps and a memory limit of 500MB
        print("\t\tWith pre-generated MA maps and memory limit", flush=True)
        LGR.info("With pre-generated MA maps and memory limit.")
        meta = MKDADensity(memory_limit="500mb")
        start = datetime.now()
        mem = memory_usage((meta.fit, (red_dset_ma,)))
        duration = (datetime.now() - start).total_seconds()
        df.loc[row_counter, "MKDADensity+PregeneratedMA+MemoryLimit max memory"] = max(mem)
        df.loc[row_counter, "MKDADensity+PregeneratedMA+MemoryLimit min memory"] = min(mem)
        df.loc[row_counter, "MKDADensity+PregeneratedMA+MemoryLimit duration"] = duration
        del start, mem, duration
        del meta

        row_counter += 1
        # Checkpoint results after every iteration
        df.to_csv("results.tsv", sep="\t", index=False)

        # Report any files held open by this user's Python processes
        # (the loop variable is named p to avoid shadowing the module-level proc)
        my_procs = [p for p in psutil.process_iter() if p.username() == USERNAME]
        my_procs = [p for p in my_procs if "python" in p.name()]
        open_files = []
        for p in my_procs:
            my_files = p.open_files()
            my_files = [f for f in my_files if not f.path.endswith(".err")]
            my_files = [f for f in my_files if not f.path.endswith(".out")]
            my_files = [f for f in my_files if "passwd" not in f.path]
            open_files += my_files

        open_files = [str(of) for of in open_files]
        if len(open_files):
            open_files_str = "\n" + "\n".join(open_files)
            print(f"Open files: {open_files_str}")