dhimmel/hetmech-query-node-pair.ipynb

## hetmech-query-node-pair.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Query connectivity between two nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import pathlib\n",
    "import zipfile\n",
    "\n",
    "import numpy\n",
    "import pandas\n",
    "import scipy.sparse\n",
    "import scipy.special\n",
    "import tqdm\n",
    "\n",
    "from hetmech.hetmat import HetMat\n",
    "import hetmech.degree_group\n",
    "import hetmech.pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read degree-grouped permutation archive info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>archive</th>\n",
       "      <th>filename</th>\n",
       "      <th>file_size</th>\n",
       "      <th>compress_type</th>\n",
       "      <th>compress_size</th>\n",
       "      <th>CRC</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>metapath</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AdG</th>\n",
       "      <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
       "      <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
       "      <td>29366</td>\n",
       "      <td>store</td>\n",
       "      <td>29366</td>\n",
       "      <td>1169061893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AeG</th>\n",
       "      <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
       "      <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
       "      <td>321650</td>\n",
       "      <td>store</td>\n",
       "      <td>321650</td>\n",
       "      <td>2872114663</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    archive  \\\n",
       "metapath                                                      \n",
       "AdG       degree-grouped-perms_length-1_damping-0.5-0000...   \n",
       "AeG       degree-grouped-perms_length-1_damping-0.5-0000...   \n",
       "\n",
       "                                                   filename  file_size  \\\n",
       "metapath                                                                 \n",
       "AdG       adjusted-path-counts/dwpc-0.5/degree-grouped-p...      29366   \n",
       "AeG       adjusted-path-counts/dwpc-0.5/degree-grouped-p...     321650   \n",
       "\n",
       "         compress_type  compress_size         CRC  \n",
       "metapath                                           \n",
       "AdG              store          29366  1169061893  \n",
       "AeG              store         321650  2872114663  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Locate the degree-grouped permutation archives for metapath lengths 1 through 3\n",
    "archive_dir = pathlib.Path('../bulk-pipeline/archives-92f40fe')\n",
    "dfs = []\n",
    "for length in (1, 2, 3):\n",
    "    path = archive_dir.joinpath(f'degree-grouped-perms_length-{length}_damping-0.5.zip-info.tsv')\n",
    "    dfs.append(pandas.read_table(path))\n",
    "# The metapath abbreviation is the archived file's basename up to its first dot\n",
    "dgp_info_df = (\n",
    "    pandas.concat(dfs)\n",
    "    .assign(metapath=lambda df: df['filename'].map(lambda x: x.rsplit('/', 1)[-1].split('.')[0]))\n",
    "    .set_index('metapath')\n",
    ")\n",
    "# Map each metapath abbreviation to its row of archive info\n",
    "metapath_to_dgp_info = {abbrev: info for abbrev, info in dgp_info_df.iterrows()}\n",
    "dgp_info_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define functions that will be moved upstream to the hetmech package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dwpc_to_degrees(graph, metapath, damping=0.5, index_pairs=()):\n",
    "    \"\"\"\n",
    "    Yield one record for each requested cell of the DWPC matrix of `metapath`,\n",
    "    combining node identifiers and names, source and target node degrees,\n",
    "    the path count, and the transformed DWPC.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    graph : hetmat object (e.g. hetmech.hetmat.HetMat)\n",
    "        Provides the metagraph, adjacency matrices, node tables and path counts.\n",
    "    metapath : any value accepted by `graph.metagraph.get_metapath`\n",
    "    damping : float\n",
    "        Damping exponent of the DWPC matrix to read.\n",
    "    index_pairs : iterable of (row_index, column_index) tuples\n",
    "        Matrix cells to describe. Nothing is yielded when it is empty (default).\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    collections.OrderedDict with the keys source_id, target_id, source_name,\n",
    "    target_name, source_degree, target_degree, path_count and dwpc.\n",
    "    \"\"\"\n",
    "    metapath = graph.metagraph.get_metapath(metapath)\n",
    "\n",
    "    # Node degrees: row sums of the first metaedge's adjacency matrix and\n",
    "    # column sums of the last metaedge's adjacency matrix\n",
    "    _, _, source_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[0], dense_threshold=0.7)\n",
    "    _, _, target_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[-1], dense_threshold=0.7)\n",
    "    source_degrees = source_adj_mat.sum(axis=1).flat\n",
    "    target_degrees = target_adj_mat.sum(axis=0).flat\n",
    "    del source_adj_mat, target_adj_mat\n",
    "\n",
    "    # Node names are looked up by matrix position, which assumes the node\n",
    "    # tables are ordered like the matrix rows/columns (TODO confirm)\n",
    "    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')\n",
    "    source_node_df = pandas.read_table(source_path)\n",
    "    source_node_names = list(source_node_df['name'])\n",
    "\n",
    "    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')\n",
    "    target_node_df = pandas.read_table(target_path)\n",
    "    target_node_names = list(target_node_df['name'])\n",
    "\n",
    "    # Scale the DWPCs by the matrix mean, then apply the arcsinh transform\n",
    "    row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)\n",
    "    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())\n",
    "    if scipy.sparse.issparse(dwpc_matrix):\n",
    "        dwpc_matrix = dwpc_matrix.toarray()\n",
    "\n",
    "    # The path count is read as the DWPC with a damping exponent of 0\n",
    "    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)\n",
    "    if scipy.sparse.issparse(path_count):\n",
    "        path_count = path_count.toarray()\n",
    "\n",
    "    for row_ind, col_ind in index_pairs:\n",
    "        dwpc_value = dwpc_matrix[row_ind, col_ind]\n",
    "        row = {\n",
    "            'source_id': row_names[row_ind],\n",
    "            'target_id': col_names[col_ind],\n",
    "            'source_name': source_node_names[row_ind],\n",
    "            'target_name': target_node_names[col_ind],\n",
    "            'source_degree': source_degrees[row_ind],\n",
    "            'target_degree': target_degrees[col_ind],\n",
    "            'path_count': path_count[row_ind, col_ind],\n",
    "            'dwpc': dwpc_value,\n",
    "        }\n",
    "        yield collections.OrderedDict(row)\n",
    "\n",
    "\n",
    "def combine_dwpc_dgp(graph, metapath, damping, index_pairs, max_p_value=1.0):\n",
    "    \"\"\"\n",
    "    Combine DWPC information with degree-grouped permutation summary metrics.\n",
    "    Includes gamma-hurdle significance estimates.\n",
    "\n",
    "    Relies on the notebook globals `metapath_to_dgp_info` and `archive_dir` to\n",
    "    locate the archived degree-grouped permutation table of `metapath`. When\n",
    "    only the inverse metapath is archived, its table is used with the\n",
    "    source_degree and target_degree columns swapped.\n",
    "\n",
    "    Yields the rows of `dwpc_to_degrees` for `index_pairs`, extended with the\n",
    "    permutation summary of the matching (source_degree, target_degree) group,\n",
    "    mean_nz, sd_nz and p_value. The helper columns sum, sum_of_squares, beta\n",
    "    and alpha are removed. Rows whose p_value exceeds `max_p_value` are skipped.\n",
    "\n",
    "    p_value is 1.0 when path_count is 0, None when the permuted DWPC sum of the\n",
    "    degree group is 0, and otherwise nnz / n times the upper tail probability\n",
    "    of a gamma distribution with shape alpha and rate beta at the DWPC.\n",
    "    \"\"\"\n",
    "    # Disabled alternative that takes the stats path from the hetmat instead of the archives:\n",
    "    # stats_path = graph.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz')\n",
    "    # dgp_df = pandas.read_table(stats_path)\n",
    "    try:\n",
    "        info = metapath_to_dgp_info[str(metapath)]\n",
    "        inverted = False\n",
    "    except KeyError:\n",
    "        info = metapath_to_dgp_info[str(metapath.inverse)]\n",
    "        inverted = True\n",
    "    path = archive_dir / info.archive\n",
    "    with zipfile.ZipFile(path) as zip_file:\n",
    "        with zip_file.open(info.filename) as read_file:\n",
    "            dgp_df = pandas.read_table(read_file, compression='gzip')\n",
    "    if inverted:\n",
    "        dgp_df = dgp_df.rename(columns={'source_degree': 'target_degree', 'target_degree': 'source_degree'})\n",
    "    # Mean and sample standard deviation of the nonzero permuted DWPCs,\n",
    "    # followed by the method-of-moments gamma parameters\n",
    "    dgp_df['mean_nz'] = dgp_df['sum'] / dgp_df['nnz']\n",
    "    dgp_df['sd_nz'] = ((dgp_df['sum_of_squares'] - dgp_df['sum'] ** 2 / dgp_df['nnz']) / (dgp_df['nnz'] - 1)) ** 0.5\n",
    "    dgp_df['beta'] = dgp_df['mean_nz'] / dgp_df['sd_nz'] ** 2\n",
    "    dgp_df['alpha'] = dgp_df['mean_nz'] * dgp_df['beta']\n",
    "    degrees_to_dgp = dgp_df.set_index(['source_degree', 'target_degree']).to_dict(orient='index')\n",
    "    dwpc_row_generator = dwpc_to_degrees(\n",
    "        graph, metapath, damping=damping, index_pairs=index_pairs)\n",
    "    for row in dwpc_row_generator:\n",
    "        degrees = row['source_degree'], row['target_degree']\n",
    "        dgp = degrees_to_dgp[degrees]\n",
    "        row.update(dgp)\n",
    "        if row['path_count'] == 0:\n",
    "            row['p_value'] = 1.0\n",
    "        elif row['sum'] == 0:\n",
    "            row['p_value'] = None\n",
    "        else:\n",
    "            # gammaincc is the regularized upper incomplete gamma function. It equals\n",
    "            # 1 - gammainc but avoids the cancellation error for small tail probabilities.\n",
    "            row['p_value'] = (\n",
    "                row['nnz'] / row['n'] *\n",
    "                scipy.special.gammaincc(row['alpha'], row['beta'] * row['dwpc'])\n",
    "            )\n",
    "        if row['p_value'] is not None and row['p_value'] > max_p_value:\n",
    "            continue\n",
    "        for key in ['sum', 'sum_of_squares', 'beta', 'alpha']:\n",
    "            del row[key]\n",
    "        yield row\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Specify parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "source_node = 'Gene', 79068 # FTO Gene\n",
    "target_node = 'Disease', 'DOID:9970' # Obesity\n",
    "\n",
    "# set DWPC damping exponent\n",
    "damping = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "252"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metapaths = hetmat.metagraph.extract_metapaths(source_node[0], target_node[0], max_length=3)\n",
    "len(metapaths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12358, 136)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "source_index = hetmat.get_node_identifiers(source_node[0]).index(source_node[1])\n",
    "target_index = hetmat.get_node_identifiers(target_node[0]).index(target_node[1])\n",
    "source_index, target_index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute adjusted DWPCs and p-values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5a05f10944ae42049304f9328979b1f6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=252), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# The queried node pair is the same for every metapath\n",
    "index_pairs = [(source_index, target_index)]\n",
    "rows = []\n",
    "for metapath in tqdm.tqdm_notebook(metapaths):\n",
    "    row_generator = combine_dwpc_dgp(hetmat, metapath, damping, index_pairs=index_pairs)\n",
    "    for row in row_generator:\n",
    "        row['metapath'] = str(metapath)\n",
    "        rows.append(row)\n",
    "metapath_df = pandas.DataFrame(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_id</th>\n",
       "      <th>target_id</th>\n",
       "      <th>source_name</th>\n",
       "      <th>target_name</th>\n",
       "      <th>source_degree</th>\n",
       "      <th>target_degree</th>\n",
       "      <th>path_count</th>\n",
       "      <th>dwpc</th>\n",
       "      <th>n</th>\n",
       "      <th>nnz</th>\n",
       "      <th>n_perms</th>\n",
       "      <th>mean_nz</th>\n",
       "      <th>sd_nz</th>\n",
       "      <th>p_value</th>\n",
       "      <th>metapath</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>79068</td>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>373</td>\n",
       "      <td>1</td>\n",
       "      <td>5.267578</td>\n",
       "      <td>10100</td>\n",
       "      <td>1740</td>\n",
       "      <td>100</td>\n",
       "      <td>5.267578</td>\n",
       "      <td>6.468376e-08</td>\n",
       "      <td>0.086139</td>\n",
       "      <td>GaD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>79068</td>\n",
       "      <td>DOID:9970</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>2</td>\n",
       "      <td>45</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>106500</td>\n",
       "      <td>1220</td>\n",
       "      <td>100</td>\n",
       "      <td>7.200037</td>\n",
       "      <td>1.727540e-07</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>GdD</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   source_id  target_id source_name target_name  source_degree  target_degree  \\\n",
       "0      79068  DOID:9970         FTO     obesity              6            373   \n",
       "1      79068  DOID:9970         FTO     obesity              2             45   \n",
       "\n",
       "   path_count      dwpc       n   nnz  n_perms   mean_nz         sd_nz  \\\n",
       "0           1  5.267578   10100  1740      100  5.267578  6.468376e-08   \n",
       "1           0  0.000000  106500  1220      100  7.200037  1.727540e-07   \n",
       "\n",
       "    p_value metapath  \n",
       "0  0.086139      GaD  \n",
       "1  1.000000      GdD  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metapath_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>metapath</th>\n",
       "      <th>source_name</th>\n",
       "      <th>target_name</th>\n",
       "      <th>source_degree</th>\n",
       "      <th>target_degree</th>\n",
       "      <th>path_count</th>\n",
       "      <th>dwpc</th>\n",
       "      <th>mean_nz</th>\n",
       "      <th>n</th>\n",
       "      <th>nnz</th>\n",
       "      <th>p_value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>GpBPpGaD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>32</td>\n",
       "      <td>373</td>\n",
       "      <td>435</td>\n",
       "      <td>2.814122</td>\n",
       "      <td>2.100517</td>\n",
       "      <td>14500</td>\n",
       "      <td>14500</td>\n",
       "      <td>4.747076e-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>GeAeGaD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>28</td>\n",
       "      <td>373</td>\n",
       "      <td>6204</td>\n",
       "      <td>2.002286</td>\n",
       "      <td>1.870643</td>\n",
       "      <td>26500</td>\n",
       "      <td>26500</td>\n",
       "      <td>7.739905e-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>GaDaGaD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>373</td>\n",
       "      <td>280</td>\n",
       "      <td>4.283209</td>\n",
       "      <td>3.463896</td>\n",
       "      <td>10100</td>\n",
       "      <td>10100</td>\n",
       "      <td>3.328533e-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>GaDpSpD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>17</td>\n",
       "      <td>25</td>\n",
       "      <td>4.434438</td>\n",
       "      <td>2.443015</td>\n",
       "      <td>50500</td>\n",
       "      <td>50498</td>\n",
       "      <td>1.351195e-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>249</th>\n",
       "      <td>GpPWpGaD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>1</td>\n",
       "      <td>373</td>\n",
       "      <td>2</td>\n",
       "      <td>3.687043</td>\n",
       "      <td>1.467271</td>\n",
       "      <td>107700</td>\n",
       "      <td>100783</td>\n",
       "      <td>7.459857e-04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>GaDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>5.138905</td>\n",
       "      <td>3.917056</td>\n",
       "      <td>90900</td>\n",
       "      <td>16242</td>\n",
       "      <td>2.361471e-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>GaDrDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>11</td>\n",
       "      <td>4.850720</td>\n",
       "      <td>2.435138</td>\n",
       "      <td>90900</td>\n",
       "      <td>83771</td>\n",
       "      <td>3.827697e-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>GaDlAlD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>33</td>\n",
       "      <td>42</td>\n",
       "      <td>3.744022</td>\n",
       "      <td>2.730794</td>\n",
       "      <td>10100</td>\n",
       "      <td>10100</td>\n",
       "      <td>6.445393e-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>GcGiGdD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>6</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>4.111242</td>\n",
       "      <td>2.415512</td>\n",
       "      <td>57700</td>\n",
       "      <td>14148</td>\n",
       "      <td>1.770697e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>133</th>\n",
       "      <td>GdDpSpD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>5</td>\n",
       "      <td>3.383199</td>\n",
       "      <td>1.939016</td>\n",
       "      <td>532500</td>\n",
       "      <td>504211</td>\n",
       "      <td>2.750328e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>185</th>\n",
       "      <td>GiGuDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1.130016</td>\n",
       "      <td>2.068239</td>\n",
       "      <td>1442700</td>\n",
       "      <td>61314</td>\n",
       "      <td>3.331929e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>GeAlDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>28</td>\n",
       "      <td>5</td>\n",
       "      <td>18</td>\n",
       "      <td>1.161357</td>\n",
       "      <td>0.720075</td>\n",
       "      <td>238500</td>\n",
       "      <td>238489</td>\n",
       "      <td>4.016257e-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>131</th>\n",
       "      <td>GdDuGdD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>2</td>\n",
       "      <td>45</td>\n",
       "      <td>4</td>\n",
       "      <td>3.838754</td>\n",
       "      <td>2.930357</td>\n",
       "      <td>106500</td>\n",
       "      <td>72110</td>\n",
       "      <td>4.128653e-02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     metapath source_name target_name  source_degree  target_degree  \\\n",
       "57   GpBPpGaD         FTO     obesity             32            373   \n",
       "41    GeAeGaD         FTO     obesity             28            373   \n",
       "108   GaDaGaD         FTO     obesity              6            373   \n",
       "117   GaDpSpD         FTO     obesity              6             17   \n",
       "249  GpPWpGaD         FTO     obesity              1            373   \n",
       "12      GaDrD         FTO     obesity              6              5   \n",
       "107   GaDrDrD         FTO     obesity              6              5   \n",
       "102   GaDlAlD         FTO     obesity              6             33   \n",
       "166   GcGiGdD         FTO     obesity              6             45   \n",
       "133   GdDpSpD         FTO     obesity              2             17   \n",
       "185   GiGuDrD         FTO     obesity              2              5   \n",
       "37    GeAlDrD         FTO     obesity             28              5   \n",
       "131   GdDuGdD         FTO     obesity              2             45   \n",
       "\n",
       "     path_count      dwpc   mean_nz        n     nnz       p_value  \n",
       "57          435  2.814122  2.100517    14500   14500  4.747076e-08  \n",
       "41         6204  2.002286  1.870643    26500   26500  7.739905e-08  \n",
       "108         280  4.283209  3.463896    10100   10100  3.328533e-07  \n",
       "117          25  4.434438  2.443015    50500   50498  1.351195e-04  \n",
       "249           2  3.687043  1.467271   107700  100783  7.459857e-04  \n",
       "12            3  5.138905  3.917056    90900   16242  2.361471e-03  \n",
       "107          11  4.850720  2.435138    90900   83771  3.827697e-03  \n",
       "102          42  3.744022  2.730794    10100   10100  6.445393e-03  \n",
       "166           2  4.111242  2.415512    57700   14148  1.770697e-02  \n",
       "133           5  3.383199  1.939016   532500  504211  2.750328e-02  \n",
       "185           1  1.130016  2.068239  1442700   61314  3.331929e-02  \n",
       "37           18  1.161357  0.720075   238500  238489  4.016257e-02  \n",
       "131           4  3.838754  2.930357   106500   72110  4.128653e-02  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show nominally significant metapaths\n",
    "(\n",
    "    metapath_df\n",
    "    .sort_values(by='p_value')\n",
    "    .query(\"p_value < 0.05\")\n",
    "    .loc[:, [\n",
    "        'metapath', 'source_name', 'target_name',\n",
    "        'source_degree', 'target_degree', 'path_count',\n",
    "        'dwpc', 'mean_nz', 'n', 'nnz', 'p_value',\n",
    "    ]]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>metapath</th>\n",
       "      <th>source_name</th>\n",
       "      <th>target_name</th>\n",
       "      <th>source_degree</th>\n",
       "      <th>target_degree</th>\n",
       "      <th>path_count</th>\n",
       "      <th>dwpc</th>\n",
       "      <th>mean_nz</th>\n",
       "      <th>n</th>\n",
       "      <th>nnz</th>\n",
       "      <th>p_value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>GuCpDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15928200</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>GuCtDrD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15928200</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>GuCbGaD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>0</td>\n",
       "      <td>373</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1769800</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>GuCbGuD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>0</td>\n",
       "      <td>74</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1769800</td>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>251</th>\n",
       "      <td>GpPWpGuD</td>\n",
       "      <td>FTO</td>\n",
       "      <td>obesity</td>\n",
       "      <td>1</td>\n",
       "      <td>74</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.658241</td>\n",
       "      <td>107700</td>\n",
       "      <td>56287</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     metapath source_name target_name  source_degree  target_degree  \\\n",
       "91    GuCpDrD         FTO     obesity              0              5   \n",
       "92    GuCtDrD         FTO     obesity              0              5   \n",
       "93    GuCbGaD         FTO     obesity              0            373   \n",
       "95    GuCbGuD         FTO     obesity              0             74   \n",
       "251  GpPWpGuD         FTO     obesity              1             74   \n",
       "\n",
       "     path_count  dwpc   mean_nz         n    nnz  p_value  \n",
       "91            0   0.0       NaN  15928200      0      1.0  \n",
       "92            0   0.0       NaN  15928200      0      1.0  \n",
       "93            0   0.0       NaN   1769800      0      1.0  \n",
       "95            0   0.0       NaN   1769800      0      1.0  \n",
       "251           0   0.0  1.658241    107700  56287      1.0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Non-significant metapaths\n",
    "(\n",
    "    metapath_df\n",
    "    .sort_values(by='p_value')\n",
    "    .tail()\n",
    "    .loc[:, [\n",
    "        'metapath', 'source_name', 'target_name',\n",
    "        'source_degree', 'target_degree', 'path_count',\n",
    "        'dwpc', 'mean_nz', 'n', 'nnz', 'p_value',\n",
    "    ]]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hetionet Neo4j Queries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To create DWPC queries for other metapaths, you can use the following method:\n",
    "\n",
    "\n",
    "```python\n",
    "import hetio.neo4j\n",
    "metapath = hetmat.metagraph.get_metapath('GaDpSpD')\n",
    "query = hetio.neo4j.construct_dwpc_query(metapath)\n",
    "print(query)\n",
    "```\n",
    "\n",
    "\n",
    "## Top _GpBPpGaD_ paths\n",
    "\n",
    "```cypher\n",
    "MATCH path = (n0:Gene)-[:PARTICIPATES_GpBP]-(n1)-[:PARTICIPATES_GpBP]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
    "WHERE n0.name = 'FTO'\n",
    "  AND n3.name = 'obesity'\n",
    "  AND n0 <> n2\n",
    "WITH [\n",
    "  size((n0)-[:PARTICIPATES_GpBP]-()),\n",
    "  size(()-[:PARTICIPATES_GpBP]-(n1)),\n",
    "  size((n1)-[:PARTICIPATES_GpBP]-()),\n",
    "  size(()-[:PARTICIPATES_GpBP]-(n2)),\n",
    "  size((n2)-[:ASSOCIATES_DaG]-()),\n",
    "  size(()-[:ASSOCIATES_DaG]-(n3))\n",
    "] AS degrees, path\n",
    "RETURN\n",
    "  path,\n",
    "  substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
    "  reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
    "ORDER BY pdp DESC\n",
    "LIMIT 10\n",
    "```\n",
    "\n",
    "## Top _GeAeGaD_ paths\n",
    "\n",
    "```cypher\n",
    "MATCH path = (n0:Gene)-[:EXPRESSES_AeG]-(n1)-[:EXPRESSES_AeG]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
    "WHERE n0.name = 'FTO'\n",
    "  AND n3.name = 'obesity'\n",
    "  AND n0 <> n2\n",
    "WITH [\n",
    "  size((n0)-[:EXPRESSES_AeG]-()),\n",
    "  size(()-[:EXPRESSES_AeG]-(n1)),\n",
    "  size((n1)-[:EXPRESSES_AeG]-()),\n",
    "  size(()-[:EXPRESSES_AeG]-(n2)),\n",
    "  size((n2)-[:ASSOCIATES_DaG]-()),\n",
    "  size(()-[:ASSOCIATES_DaG]-(n3))\n",
    "] AS degrees, path\n",
    "RETURN\n",
    "  path,\n",
    "  substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
    "  reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
    "ORDER BY pdp DESC\n",
    "LIMIT 10\n",
    "```\n",
    "\n",
    "\n",
    "## Top _GaDpSpD_ paths\n",
    "\n",
    "```cypher\n",
    "MATCH path = (n0:Gene)-[:ASSOCIATES_DaG]-(n1)-[:PRESENTS_DpS]-(n2)-[:PRESENTS_DpS]-(n3:Disease)\n",
    "WHERE n0.name = 'FTO'\n",
    "  AND n3.name = 'obesity'\n",
    "  AND n1 <> n3\n",
    "WITH [\n",
    "  size((n0)-[:ASSOCIATES_DaG]-()),\n",
    "  size(()-[:ASSOCIATES_DaG]-(n1)),\n",
    "  size((n1)-[:PRESENTS_DpS]-()),\n",
    "  size(()-[:PRESENTS_DpS]-(n2)),\n",
    "  size((n2)-[:PRESENTS_DpS]-()),\n",
    "  size(()-[:PRESENTS_DpS]-(n3))\n",
    "] AS degrees, path\n",
    "RETURN\n",
    "  path,\n",
    "  substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
    "  reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
    "ORDER BY pdp DESC\n",
    "LIMIT 10\n",
    "```"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:hetmech]",
   "language": "python",
   "name": "conda-env-hetmech-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}