Skip to content

Instantly share code, notes, and snippets.

@yuxuanzhuang
Last active August 20, 2021 08:14
Show Gist options
  • Save yuxuanzhuang/981d0dd29670a5864f99f57213c63a31 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RMSD Code Snippet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"pwd = os.getcwd()\n",
"\n",
"import pickle\n",
"import itertools\n",
"import numpy as np\n",
"\n",
"import MDAnalysis as mda\n",
"import MDAnalysis.transformations as trans\n",
"from MDAnalysis.analysis import align, pca, rms\n",
"\n",
"\n",
"import pandas as pd\n",
"import dask\n",
"import dask.dataframe as dd\n",
"\n",
"from dask.distributed import Client, LocalCluster\n",
"\n",
"# for grouping protein chains\n",
"# https://groups.google.com/g/mdnalysis-discussion/c/umDpvbCmQiE/m/iHN3X-IYAwAJ?pli=1\n",
"from grouphug import GroupHug\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Dask Cluster"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# custom functions\n",
"from dask_cluster_tcbcluster import ip_address, add_workers"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# `ip` is a deprecated keyword in dask.distributed; `host` is the supported name.\n",
"cluster = LocalCluster(n_workers=24, host=ip_address, scheduler_port=8785)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add 6 more workers to the scheduler at ip_address:8785\n",
"# (add_workers is a site-specific helper imported above; presumably it\n",
"# launches worker processes on cluster nodes -- confirm against its source).\n",
"dask_worker = add_workers(6,\n",
" ip_address=ip_address,\n",
" port=8785)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Universes"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Save the universes to files to decrease memory usage and avoid including large objects in dask graph.\n",
"def load_trajectory(trajectory):\n",
"    \"\"\"Load one trajectory, attach PBC/fitting transformations, pickle it.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    trajectory : str\n",
"        Directory expected to contain ca.pdb, md.xtc and md.tpr.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        Path of the pickle file holding the transformed Universe.\n",
"    \"\"\"\n",
"    u = mda.Universe(trajectory + '/ca.pdb',\n",
"                     trajectory + '/md.xtc')\n",
"\n",
"    # Bond information lives in the TPR topology, not in the PDB.\n",
"    u_bond = mda.Universe(trajectory + '/md.tpr')\n",
"    u.add_bonds(u_bond.bonds.to_indices())\n",
"\n",
"    u_prot = u.select_atoms('protein')\n",
"    prot_chain_list = [chain.atoms for chain in u_prot.segments]\n",
"    non_prot = u.select_atoms('not protein')\n",
"\n",
"    # On-the-fly transformations: unwrap molecules, keep protein chains\n",
"    # together, center the protein, wrap non-protein atoms back into the\n",
"    # box, then fit rotation/translation on the C-alpha atoms.\n",
"    unwrap_trans = trans.unwrap(u.atoms)\n",
"    prot_group = GroupHug(*prot_chain_list)\n",
"    center_in_box = trans.center_in_box(u_prot)\n",
"    wrap = trans.wrap(non_prot)\n",
"    rot_fit_trans = trans.fit_rot_trans(u.select_atoms('name CA'), u.select_atoms('name CA'))\n",
"    u.trajectory.add_transformations(*[unwrap_trans, prot_group, center_in_box, wrap, rot_fit_trans])\n",
"\n",
"    # Build the output path once and reuse it for both dump and return\n",
"    # (the original computed the same string twice and left dead code).\n",
"    pickle_path = pwd + '/trajectory/' + '_'.join(trajectory.split('/')[-3:-1]) + '.pickle'\n",
"    with open(pickle_path, 'wb') as f:\n",
"        pickle.dump(u, f)\n",
"    return pickle_path"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# list of trajectory path\n",
"trajectory_list = []"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# One delayed task per trajectory; compute() pickles each Universe to disk\n",
"# and returns the list of pickle-file paths.\n",
"job_list = [dask.delayed(load_trajectory)(trajectory)\n",
"            for trajectory in trajectory_list]\n",
"\n",
"trajectory_files = dask.compute(job_list)[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create metadata for analyses"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Empty frame defining the metadata schema; rows are filled in after the\n",
"# dask.compute of append_metadata below.\n",
"md_dataframe = pd.DataFrame(\n",
"    columns=[\"universe\", \"system\", \"MD_name\", \"frame\", \"traj_time\", \"stride\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def append_metadata(universe, system, stride):\n",
"    \"\"\"Build metadata rows for every strided frame of one pickled Universe.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    universe : str\n",
"        Path to a pickled MDAnalysis Universe (written by load_trajectory).\n",
"    system : int\n",
"        Index identifying the trajectory/system.\n",
"    stride : int\n",
"        Keep every `stride`-th frame.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        One [universe, system, MD_name, frame, traj_time, stride] row per\n",
"        strided frame; traj_time = frame index * trajectory dt.\n",
"    \"\"\"\n",
"    # Context manager closes the pickle file promptly; the original\n",
"    # pickle.load(open(...)) leaked the file handle.\n",
"    with open(universe, \"rb\") as f:\n",
"        u = pickle.load(f)\n",
"\n",
"    md_name = u.trajectory.filename\n",
"    timestep = u.trajectory.dt\n",
"\n",
"    return [[universe, system, md_name, i, i * timestep, stride]\n",
"            for i in range(0, u.trajectory.n_frames, stride)]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# One delayed metadata task per pickled trajectory; the list index doubles\n",
"# as the system identifier.\n",
"job_list = [dask.delayed(append_metadata)(trajectory, system=ind, stride=100)\n",
"            for ind, trajectory in enumerate(trajectory_files)]\n",
"\n",
"meta_data = dask.compute(job_list)[0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# DataFrame.append is deprecated (removed in pandas 2.0) and appending in a\n",
"# loop is quadratic; build all per-trajectory frames and concatenate once.\n",
"md_dataframe = pd.concat(\n",
"    [md_dataframe]\n",
"    + [pd.DataFrame(meta_data[i], columns=md_dataframe.columns)\n",
"       for i in range(len(trajectory_files))],\n",
"    ignore_index=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>universe</th>\n",
" <th>system</th>\n",
" <th>MD_name</th>\n",
" <th>frame</th>\n",
" <th>traj_time</th>\n",
" <th>stride</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>100</td>\n",
" <td>10000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>200</td>\n",
" <td>20000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>300</td>\n",
" <td>30000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>400</td>\n",
" <td>40000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2670</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10600</td>\n",
" <td>1060000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2671</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10700</td>\n",
" <td>1070000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2672</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10800</td>\n",
" <td>1080000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2673</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10900</td>\n",
" <td>1090000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2674</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>11000</td>\n",
" <td>1100000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2675 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" universe system \\\n",
"0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"... ... ... \n",
"2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"\n",
" MD_name frame traj_time \\\n",
"0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 0.0 \n",
"1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 100 10000.0 \n",
"2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 200 20000.0 \n",
"3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 300 30000.0 \n",
"4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 400 40000.0 \n",
"... ... ... ... \n",
"2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10600 1060000.0 \n",
"2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10700 1070000.0 \n",
"2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10800 1080000.0 \n",
"2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10900 1090000.0 \n",
"2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 11000 1100000.0 \n",
"\n",
" stride \n",
"0 100 \n",
"1 100 \n",
"2 100 \n",
"3 100 \n",
"4 100 \n",
"... ... \n",
"2670 100 \n",
"2671 100 \n",
"2672 100 \n",
"2673 100 \n",
"2674 100 \n",
"\n",
"[2675 rows x 6 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# meta_data for all the trajectories and strided frames.\n",
"md_dataframe"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Turn into a dask.dataframe so dask can partition it into suitable pieces.\n",
"md_ddframe = dd.from_pandas(md_dataframe, npartitions=230)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analysis"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# It is designed to make sure a partition with two partial trajectories can also work.\n",
"# Can be designed to save analysis results to files to decrease memory usage.\n",
"class DaskChunkMdanalysis(object):\n",
"    \"\"\"Base class for running an MDAnalysis analysis on one dask partition.\n",
"\n",
"    An instance is the callable handed to ``dask.dataframe.map_partitions``;\n",
"    subclasses override ``run_analysis`` to produce one value per frame row.\n",
"    \"\"\"\n",
"    def __init__(self, **kwargs):\n",
"        # Stash arbitrary keyword options on the instance for subclasses.\n",
"        self.__dict__.update(kwargs)\n",
"\n",
"    def __call__(self, df):\n",
"        return self.run_df(df)\n",
"\n",
"    def run_df(self, df):\n",
"        \"\"\"Run the analysis for every system present in this partition.\"\"\"\n",
"        result = []\n",
"        # Scalar key instead of ['system']: list keys yield tuple group keys\n",
"        # (and a FutureWarning) in newer pandas; the key itself is unused.\n",
"        for system, df_sys in df.groupby('system'):\n",
"            # Universe was pickled by load_trajectory; close the file promptly.\n",
"            with open(df_sys.universe.iloc[0], 'rb') as f:\n",
"                universe = pickle.load(f)\n",
"            start = df_sys.frame.iloc[0]\n",
"            stop = df_sys.frame.iloc[-1] + 1\n",
"            step = df_sys.stride.iloc[0]\n",
"            start, stop, step = universe.trajectory.check_slice_indices(start, stop, step)\n",
"\n",
"            result.append(self.run_analysis(universe, start, stop, step))\n",
"        # Flatten per-system results into one list matching the partition rows.\n",
"        return list(itertools.chain.from_iterable(result))\n",
"\n",
"    def run_analysis(self, universe, start, stop, step):\n",
"        raise NotImplementedError('Only for inheritance.')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# A simple dict to save the final results.\n",
"analysis_result = {}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get RMSD"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Override run_analysis to compute C-alpha RMSD against frame 0.\n",
"class get_rmsd(DaskChunkMdanalysis):\n",
"    def run_analysis(self, universe, start, stop, step):\n",
"        ca_atoms = universe.select_atoms(\"name CA\")\n",
"        rmsd_run = rms.RMSD(ca_atoms, reference=ca_atoms, ref_frame=0).run(\n",
"            start=start, stop=stop, step=step)\n",
"        # rmsd rows are (frame, time, value); keep only the value column.\n",
"        return rmsd_run.rmsd.T[2]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Tell dask the output schema: the input columns plus a float64 RMSD column.\n",
"meta = {**dict(md_ddframe.dtypes), \"RMSD\": \"f8\"}\n",
"\n",
"def _assign_rmsd(df):\n",
"    return df.assign(RMSD=get_rmsd()(df))\n",
"\n",
"analysis_result['RMSD'] = md_ddframe.map_partitions(_assign_rmsd, meta=meta).compute()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'RMSD': universe system \\\n",
" 0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" ... ... ... \n",
" 2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" \n",
" MD_name frame traj_time \\\n",
" 0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 0.0 \n",
" 1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 100 10000.0 \n",
" 2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 200 20000.0 \n",
" 3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 300 30000.0 \n",
" 4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 400 40000.0 \n",
" ... ... ... ... \n",
" 2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10600 1060000.0 \n",
" 2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10700 1070000.0 \n",
" 2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10800 1080000.0 \n",
" 2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10900 1090000.0 \n",
" 2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 11000 1100000.0 \n",
" \n",
" stride RMSD \n",
" 0 100 0.000003 \n",
" 1 100 1.489761 \n",
" 2 100 1.548931 \n",
" 3 100 1.703572 \n",
" 4 100 1.789073 \n",
" ... ... ... \n",
" 2670 100 3.172471 \n",
" 2671 100 3.084686 \n",
" 2672 100 2.940567 \n",
" 2673 100 3.239453 \n",
" 2674 100 3.120961 \n",
" \n",
" [2675 rows x 7 columns]}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analysis_result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mda_py38",
"language": "python",
"name": "jupyter"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment