Skip to content

Instantly share code, notes, and snippets.

@yuxuanzhuang
Last active August 20, 2021 08:14
Show Gist options
  • Save yuxuanzhuang/981d0dd29670a5864f99f57213c63a31 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RMSD Code Snippet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"pwd = os.getcwd()\n",
"\n",
"import pickle\n",
"import itertools\n",
"import numpy as np\n",
"\n",
"import MDAnalysis as mda\n",
"import MDAnalysis.transformations as trans\n",
"from MDAnalysis.analysis import align, pca, rms\n",
"\n",
"\n",
"import pandas as pd\n",
"import dask\n",
"import dask.dataframe as dd\n",
"\n",
"from dask.distributed import Client, LocalCluster\n",
"\n",
"# for grouping protein chains\n",
"# https://groups.google.com/g/mdnalysis-discussion/c/umDpvbCmQiE/m/iHN3X-IYAwAJ?pli=1\n",
"from grouphug import GroupHug\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create Dask Cluster"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# custom functions\n",
"from dask_cluster_tcbcluster import ip_address, add_workers"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# `ip` is a deprecated keyword in dask.distributed; `host` is the supported name.\n",
"cluster = LocalCluster(n_workers=24, host=ip_address, scheduler_port=8785)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add 6 more workers to the scheduler at ip_address:8785\n",
"# (add_workers is a site-specific helper imported above; presumably it\n",
"# launches worker processes on cluster nodes -- confirm against its source).\n",
"dask_worker = add_workers(6,\n",
" ip_address=ip_address,\n",
" port=8785)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load Universes"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Save the universes to files to decrease memory usage and avoid including large objects in dask graph.\n",
"def load_trajectory(trajectory):\n",
"    \"\"\"Load one trajectory, attach PBC/fitting transformations, pickle it.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    trajectory : str\n",
"        Directory expected to contain ca.pdb, md.xtc and md.tpr.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str\n",
"        Path of the pickle file holding the transformed Universe.\n",
"    \"\"\"\n",
"    u = mda.Universe(trajectory + '/ca.pdb',\n",
"                     trajectory + '/md.xtc')\n",
"\n",
"    # Bond information lives in the TPR topology, not in the PDB.\n",
"    u_bond = mda.Universe(trajectory + '/md.tpr')\n",
"    u.add_bonds(u_bond.bonds.to_indices())\n",
"\n",
"    u_prot = u.select_atoms('protein')\n",
"    prot_chain_list = [chain.atoms for chain in u_prot.segments]\n",
"    non_prot = u.select_atoms('not protein')\n",
"\n",
"    # On-the-fly transformations: unwrap molecules, keep protein chains\n",
"    # together, center the protein, wrap non-protein atoms back into the\n",
"    # box, then fit rotation/translation on the C-alpha atoms.\n",
"    unwrap_trans = trans.unwrap(u.atoms)\n",
"    prot_group = GroupHug(*prot_chain_list)\n",
"    center_in_box = trans.center_in_box(u_prot)\n",
"    wrap = trans.wrap(non_prot)\n",
"    rot_fit_trans = trans.fit_rot_trans(u.select_atoms('name CA'), u.select_atoms('name CA'))\n",
"    u.trajectory.add_transformations(*[unwrap_trans, prot_group, center_in_box, wrap, rot_fit_trans])\n",
"\n",
"    # Build the output path once and reuse it for both dump and return\n",
"    # (the original computed the same string twice and left dead code).\n",
"    pickle_path = pwd + '/trajectory/' + '_'.join(trajectory.split('/')[-3:-1]) + '.pickle'\n",
"    with open(pickle_path, 'wb') as f:\n",
"        pickle.dump(u, f)\n",
"    return pickle_path"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# list of trajectory path\n",
"trajectory_list = []"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# One delayed task per trajectory; compute() pickles each Universe to disk\n",
"# and returns the list of pickle-file paths.\n",
"job_list = [dask.delayed(load_trajectory)(trajectory)\n",
"            for trajectory in trajectory_list]\n",
"\n",
"trajectory_files = dask.compute(job_list)[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create metadata for analyses"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Empty frame defining the metadata schema; rows are filled in after the\n",
"# dask.compute of append_metadata below.\n",
"md_dataframe = pd.DataFrame(\n",
"    columns=[\"universe\", \"system\", \"MD_name\", \"frame\", \"traj_time\", \"stride\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def append_metadata(universe, system, stride):\n",
"    \"\"\"Build metadata rows for every strided frame of one pickled Universe.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    universe : str\n",
"        Path to a pickled MDAnalysis Universe (written by load_trajectory).\n",
"    system : int\n",
"        Index identifying the trajectory/system.\n",
"    stride : int\n",
"        Keep every `stride`-th frame.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        One [universe, system, MD_name, frame, traj_time, stride] row per\n",
"        strided frame; traj_time = frame index * trajectory dt.\n",
"    \"\"\"\n",
"    # Context manager closes the pickle file promptly; the original\n",
"    # pickle.load(open(...)) leaked the file handle.\n",
"    with open(universe, \"rb\") as f:\n",
"        u = pickle.load(f)\n",
"\n",
"    md_name = u.trajectory.filename\n",
"    timestep = u.trajectory.dt\n",
"\n",
"    return [[universe, system, md_name, i, i * timestep, stride]\n",
"            for i in range(0, u.trajectory.n_frames, stride)]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# One delayed metadata task per pickled trajectory; the list index doubles\n",
"# as the system identifier.\n",
"job_list = [dask.delayed(append_metadata)(trajectory, system=ind, stride=100)\n",
"            for ind, trajectory in enumerate(trajectory_files)]\n",
"\n",
"meta_data = dask.compute(job_list)[0]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# DataFrame.append is deprecated (removed in pandas 2.0) and appending in a\n",
"# loop is quadratic; build all per-trajectory frames and concatenate once.\n",
"md_dataframe = pd.concat(\n",
"    [md_dataframe]\n",
"    + [pd.DataFrame(meta_data[i], columns=md_dataframe.columns)\n",
"       for i in range(len(trajectory_files))],\n",
"    ignore_index=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>universe</th>\n",
" <th>system</th>\n",
" <th>MD_name</th>\n",
" <th>frame</th>\n",
" <th>traj_time</th>\n",
" <th>stride</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>100</td>\n",
" <td>10000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>200</td>\n",
" <td>20000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>300</td>\n",
" <td>30000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>0</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>400</td>\n",
" <td>40000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2670</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10600</td>\n",
" <td>1060000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2671</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10700</td>\n",
" <td>1070000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2672</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10800</td>\n",
" <td>1080000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2673</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>10900</td>\n",
" <td>1090000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2674</th>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>23</td>\n",
" <td>/mnt/cephfs/projects/2020100800_alpha7_nachrs/...</td>\n",
" <td>11000</td>\n",
" <td>1100000.0</td>\n",
" <td>100</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2675 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" universe system \\\n",
"0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
"... ... ... \n",
"2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
"\n",
" MD_name frame traj_time \\\n",
"0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 0.0 \n",
"1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 100 10000.0 \n",
"2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 200 20000.0 \n",
"3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 300 30000.0 \n",
"4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 400 40000.0 \n",
"... ... ... ... \n",
"2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10600 1060000.0 \n",
"2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10700 1070000.0 \n",
"2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10800 1080000.0 \n",
"2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10900 1090000.0 \n",
"2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 11000 1100000.0 \n",
"\n",
" stride \n",
"0 100 \n",
"1 100 \n",
"2 100 \n",
"3 100 \n",
"4 100 \n",
"... ... \n",
"2670 100 \n",
"2671 100 \n",
"2672 100 \n",
"2673 100 \n",
"2674 100 \n",
"\n",
"[2675 rows x 6 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# meta_data for all the trajectories and strided frames.\n",
"md_dataframe"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Turn into a dask.dataframe so dask can partition it into suitable pieces.\n",
"md_ddframe = dd.from_pandas(md_dataframe, npartitions=230)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Analysis"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# It is designed to make sure a partition with two partial trajectories can also work.\n",
"# Can be designed to save analysis results to files to decrease memory usage.\n",
"class DaskChunkMdanalysis(object):\n",
"    \"\"\"Base class for running an MDAnalysis analysis on one dask partition.\n",
"\n",
"    An instance is the callable handed to ``dask.dataframe.map_partitions``;\n",
"    subclasses override ``run_analysis`` to produce one value per frame row.\n",
"    \"\"\"\n",
"    def __init__(self, **kwargs):\n",
"        # Stash arbitrary keyword options on the instance for subclasses.\n",
"        self.__dict__.update(kwargs)\n",
"\n",
"    def __call__(self, df):\n",
"        return self.run_df(df)\n",
"\n",
"    def run_df(self, df):\n",
"        \"\"\"Run the analysis for every system present in this partition.\"\"\"\n",
"        result = []\n",
"        # Scalar key instead of ['system']: list keys yield tuple group keys\n",
"        # (and a FutureWarning) in newer pandas; the key itself is unused.\n",
"        for system, df_sys in df.groupby('system'):\n",
"            # Universe was pickled by load_trajectory; close the file promptly.\n",
"            with open(df_sys.universe.iloc[0], 'rb') as f:\n",
"                universe = pickle.load(f)\n",
"            start = df_sys.frame.iloc[0]\n",
"            stop = df_sys.frame.iloc[-1] + 1\n",
"            step = df_sys.stride.iloc[0]\n",
"            start, stop, step = universe.trajectory.check_slice_indices(start, stop, step)\n",
"\n",
"            result.append(self.run_analysis(universe, start, stop, step))\n",
"        # Flatten per-system results into one list matching the partition rows.\n",
"        return list(itertools.chain.from_iterable(result))\n",
"\n",
"    def run_analysis(self, universe, start, stop, step):\n",
"        raise NotImplementedError('Only for inheritance.')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# A simple dict to save the final results.\n",
"analysis_result = {}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get RMSD"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Override run_analysis to compute C-alpha RMSD against frame 0.\n",
"class get_rmsd(DaskChunkMdanalysis):\n",
"    def run_analysis(self, universe, start, stop, step):\n",
"        ca_atoms = universe.select_atoms(\"name CA\")\n",
"        rmsd_run = rms.RMSD(ca_atoms, reference=ca_atoms, ref_frame=0).run(\n",
"            start=start, stop=stop, step=step)\n",
"        # rmsd rows are (frame, time, value); keep only the value column.\n",
"        return rmsd_run.rmsd.T[2]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Tell dask the output schema: the input columns plus a float64 RMSD column.\n",
"meta = {**dict(md_ddframe.dtypes), \"RMSD\": \"f8\"}\n",
"\n",
"def _assign_rmsd(df):\n",
"    return df.assign(RMSD=get_rmsd()(df))\n",
"\n",
"analysis_result['RMSD'] = md_ddframe.map_partitions(_assign_rmsd, meta=meta).compute()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'RMSD': universe system \\\n",
" 0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" 4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 \n",
" ... ... ... \n",
" 2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" 2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 23 \n",
" \n",
" MD_name frame traj_time \\\n",
" 0 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 0 0.0 \n",
" 1 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 100 10000.0 \n",
" 2 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 200 20000.0 \n",
" 3 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 300 30000.0 \n",
" 4 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 400 40000.0 \n",
" ... ... ... ... \n",
" 2670 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10600 1060000.0 \n",
" 2671 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10700 1070000.0 \n",
" 2672 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10800 1080000.0 \n",
" 2673 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 10900 1090000.0 \n",
" 2674 /mnt/cephfs/projects/2020100800_alpha7_nachrs/... 11000 1100000.0 \n",
" \n",
" stride RMSD \n",
" 0 100 0.000003 \n",
" 1 100 1.489761 \n",
" 2 100 1.548931 \n",
" 3 100 1.703572 \n",
" 4 100 1.789073 \n",
" ... ... ... \n",
" 2670 100 3.172471 \n",
" 2671 100 3.084686 \n",
" 2672 100 2.940567 \n",
" 2673 100 3.239453 \n",
" 2674 100 3.120961 \n",
" \n",
" [2675 rows x 7 columns]}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"analysis_result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mda_py38",
"language": "python",
"name": "jupyter"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment