Skip to content

Instantly share code, notes, and snippets.

@dhimmel
Created October 23, 2018 21:38
Show Gist options
  • Save dhimmel/aa552cffdbe057226b1b305dd15e552d to your computer and use it in GitHub Desktop.
Save dhimmel/aa552cffdbe057226b1b305dd15e552d to your computer and use it in GitHub Desktop.
Query relationship between the FTO gene and obesity using hetmech
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Query connectivity between two nodes"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import pathlib\n",
"import zipfile\n",
"\n",
"import numpy\n",
"import pandas\n",
"import scipy.sparse\n",
"import scipy.special\n",
"import tqdm\n",
"\n",
"from hetmech.hetmat import HetMat\n",
"import hetmech.degree_group\n",
"import hetmech.pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read degree-grouped permutation archive info"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>archive</th>\n",
" <th>filename</th>\n",
" <th>file_size</th>\n",
" <th>compress_type</th>\n",
" <th>compress_size</th>\n",
" <th>CRC</th>\n",
" </tr>\n",
" <tr>\n",
" <th>metapath</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AdG</th>\n",
" <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
" <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
" <td>29366</td>\n",
" <td>store</td>\n",
" <td>29366</td>\n",
" <td>1169061893</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AeG</th>\n",
" <td>degree-grouped-perms_length-1_damping-0.5-0000...</td>\n",
" <td>adjusted-path-counts/dwpc-0.5/degree-grouped-p...</td>\n",
" <td>321650</td>\n",
" <td>store</td>\n",
" <td>321650</td>\n",
" <td>2872114663</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" archive \\\n",
"metapath \n",
"AdG degree-grouped-perms_length-1_damping-0.5-0000... \n",
"AeG degree-grouped-perms_length-1_damping-0.5-0000... \n",
"\n",
" filename file_size \\\n",
"metapath \n",
"AdG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 29366 \n",
"AeG adjusted-path-counts/dwpc-0.5/degree-grouped-p... 321650 \n",
"\n",
" compress_type compress_size CRC \n",
"metapath \n",
"AdG store 29366 1169061893 \n",
"AeG store 321650 2872114663 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Locate the degree-grouped permutation archives and index their contents by metapath\n",
"archive_dir = pathlib.Path('../bulk-pipeline/archives-92f40fe')\n",
"dfs = [\n",
"    pandas.read_table(archive_dir / f'degree-grouped-perms_length-{length}_damping-0.5.zip-info.tsv')\n",
"    for length in range(1, 4)\n",
"]\n",
"dgp_info_df = pandas.concat(dfs)\n",
"# The metapath is the archived file's basename up to its first dot\n",
"dgp_info_df['metapath'] = [name.rsplit('/', 1)[-1].split('.')[0] for name in dgp_info_df.filename]\n",
"dgp_info_df = dgp_info_df.set_index('metapath')\n",
"# Map each metapath to its row of archive information\n",
"metapath_to_dgp_info = {metapath: info for metapath, info in dgp_info_df.iterrows()}\n",
"dgp_info_df.head(2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define functions that will be moved upstream to the hetmech package"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def dwpc_to_degrees(graph, metapath, damping=0.5, index_pairs=()):\n",
"    \"\"\"\n",
"    Yield a description of each requested cell in a DWPC matrix, adding source\n",
"    and target node degree info as well as the corresponding path count.\n",
"\n",
"    Parameters:\n",
"        graph: object exposing `metagraph`, `metaedge_to_adjacency_matrix`,\n",
"            `get_nodes_path` and `read_path_counts` (a hetmech HetMat in this notebook).\n",
"        metapath: metapath specification accepted by `graph.metagraph.get_metapath`.\n",
"        damping: damping exponent of the stored DWPC matrix to read.\n",
"        index_pairs: iterable of (row index, column index) pairs naming the cells\n",
"            to describe. The default is empty, in which case nothing is yielded.\n",
"\n",
"    Yields:\n",
"        collections.OrderedDict with the keys source_id, target_id, source_name,\n",
"        target_name, source_degree, target_degree, path_count and dwpc, where\n",
"        dwpc is arcsinh(DWPC / mean of the DWPC matrix).\n",
"\n",
"    This is a generator: the matrices and node tables are only read once\n",
"    iteration starts.\n",
"    \"\"\"\n",
"    metapath = graph.metagraph.get_metapath(metapath)\n",
"    # Source degrees are the row sums of the first metaedge's adjacency matrix;\n",
"    # target degrees are the column sums of the last metaedge's adjacency matrix.\n",
"    _, _, source_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[0], dense_threshold=0.7)\n",
"    _, _, target_adj_mat = graph.metaedge_to_adjacency_matrix(metapath[-1], dense_threshold=0.7)\n",
"    source_degrees = source_adj_mat.sum(axis=1).flat\n",
"    target_degrees = target_adj_mat.sum(axis=0).flat\n",
"    del source_adj_mat, target_adj_mat\n",
"\n",
"    source_path = graph.get_nodes_path(metapath.source(), file_format='tsv')\n",
"    source_node_names = list(pandas.read_table(source_path)['name'])\n",
"\n",
"    target_path = graph.get_nodes_path(metapath.target(), file_format='tsv')\n",
"    target_node_names = list(pandas.read_table(target_path)['name'])\n",
"\n",
"    row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping)\n",
"    # Scale by the matrix mean, then apply the inverse hyperbolic sine transform\n",
"    dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean())\n",
"    if scipy.sparse.issparse(dwpc_matrix):\n",
"        dwpc_matrix = dwpc_matrix.toarray()\n",
"\n",
"    # The matrix stored for damping 0.0 is reported as the path count\n",
"    _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0)\n",
"    if scipy.sparse.issparse(path_count):\n",
"        path_count = path_count.toarray()\n",
"\n",
"    for row_ind, col_ind in index_pairs:\n",
"        # Built from ordered pairs so the key order does not depend on dict ordering\n",
"        yield collections.OrderedDict([\n",
"            ('source_id', row_names[row_ind]),\n",
"            ('target_id', col_names[col_ind]),\n",
"            ('source_name', source_node_names[row_ind]),\n",
"            ('target_name', target_node_names[col_ind]),\n",
"            ('source_degree', source_degrees[row_ind]),\n",
"            ('target_degree', target_degrees[col_ind]),\n",
"            ('path_count', path_count[row_ind, col_ind]),\n",
"            ('dwpc', dwpc_matrix[row_ind, col_ind]),\n",
"        ])\n",
"\n",
"\n",
"def combine_dwpc_dgp(graph, metapath, damping, index_pairs, max_p_value=1.0):\n",
"    \"\"\"\n",
"    Combine DWPC information with degree-grouped permutation summary metrics.\n",
"    Includes gamma-hurdle significance estimates.\n",
"\n",
"    The permutation summary table is read from the zip archive listed for the\n",
"    metapath in the notebook-level `metapath_to_dgp_info` mapping, with archive\n",
"    names resolved against `archive_dir`. When the metapath itself is not listed,\n",
"    the table of its inverse is used with the source_degree and target_degree\n",
"    columns swapped.\n",
"\n",
"    Parameters:\n",
"        graph, metapath, damping, index_pairs: forwarded to `dwpc_to_degrees`.\n",
"        max_p_value: rows whose p_value exceeds this threshold are skipped.\n",
"            Rows whose p_value is None are always yielded.\n",
"\n",
"    Yields:\n",
"        Each row from `dwpc_to_degrees`, updated with the summary columns of its\n",
"        (source_degree, target_degree) group plus `p_value`. The sum,\n",
"        sum_of_squares, beta and alpha entries are removed before yielding.\n",
"        p_value is 1.0 when path_count is 0, None when the group's sum is 0,\n",
"        and otherwise nnz / n * gammaincc(alpha, beta * dwpc).\n",
"\n",
"    Raises:\n",
"        KeyError: if neither the metapath nor its inverse is in\n",
"            `metapath_to_dgp_info`, or if a row's degree pair has no entry in\n",
"            the permutation summary table.\n",
"    \"\"\"\n",
"    # stats_path = graph.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz')\n",
"    # dgp_df = pandas.read_table(stats_path)\n",
"    try:\n",
"        info = metapath_to_dgp_info[str(metapath)]\n",
"        inverted = False\n",
"    except KeyError:\n",
"        # Not listed under its own name: fall back to the inverse metapath's entry\n",
"        info = metapath_to_dgp_info[str(metapath.inverse)]\n",
"        inverted = True\n",
"    path = archive_dir / info.archive\n",
"    with zipfile.ZipFile(path) as zip_file:\n",
"        with zip_file.open(info.filename) as read_file:\n",
"            dgp_df = pandas.read_table(read_file, compression='gzip')\n",
"    if inverted:\n",
"        # The table describes the inverse metapath, so its degree columns are swapped\n",
"        dgp_df = dgp_df.rename(columns={'source_degree': 'target_degree', 'target_degree': 'source_degree'})\n",
"    # Mean and sample standard deviation of the nonzero values, from running sums\n",
"    dgp_df['mean_nz'] = dgp_df['sum'] / dgp_df['nnz']\n",
"    dgp_df['sd_nz'] = ((dgp_df['sum_of_squares'] - dgp_df['sum'] ** 2 / dgp_df['nnz']) / (dgp_df['nnz'] - 1)) ** 0.5\n",
"    # Gamma parameters by moment matching: beta = mean / variance, alpha = mean * beta\n",
"    dgp_df['beta'] = dgp_df['mean_nz'] / dgp_df['sd_nz'] ** 2\n",
"    dgp_df['alpha'] = dgp_df['mean_nz'] * dgp_df['beta']\n",
"    degrees_to_dgp = dgp_df.set_index(['source_degree', 'target_degree']).to_dict(orient='index')\n",
"    dwpc_row_generator = dwpc_to_degrees(\n",
"        graph, metapath, damping=damping, index_pairs=index_pairs)\n",
"    for row in dwpc_row_generator:\n",
"        degrees = row['source_degree'], row['target_degree']\n",
"        row.update(degrees_to_dgp[degrees])\n",
"        if row['path_count'] == 0:\n",
"            row['p_value'] = 1.0\n",
"        elif row['sum'] == 0:\n",
"            row['p_value'] = None\n",
"        else:\n",
"            # gammaincc is the regularized upper incomplete gamma function, equal to\n",
"            # 1 - gammainc but computed directly, which keeps precision for small p-values\n",
"            row['p_value'] = (\n",
"                row['nnz'] / row['n'] *\n",
"                scipy.special.gammaincc(row['alpha'], row['beta'] * row['dwpc'])\n",
"            )\n",
"        if row['p_value'] is not None and row['p_value'] > max_p_value:\n",
"            continue\n",
"        for key in ['sum', 'sum_of_squares', 'beta', 'alpha']:\n",
"            del row[key]\n",
"        yield row\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify parameters"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"hetmat = HetMat('../../data/hetionet-v1.0.hetmat/')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"source_node = 'Gene', 79068 # FTO Gene\n",
"target_node = 'Disease', 'DOID:9970' # Obesity\n",
"\n",
"# set DWPC damping exponent\n",
"damping = 0.5"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"252"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metapaths = hetmat.metagraph.extract_metapaths(source_node[0], target_node[0], max_length=3)\n",
"len(metapaths)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(12358, 136)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"source_index = hetmat.get_node_identifiers(source_node[0]).index(source_node[1])\n",
"target_index = hetmat.get_node_identifiers(target_node[0]).index(target_node[1])\n",
"source_index, target_index"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute adjusted DWPCs and p-values"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a05f10944ae42049304f9328979b1f6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=252), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# The same (source, target) cell is queried for every metapath\n",
"index_pairs = [(source_index, target_index)]\n",
"rows = list()\n",
"for metapath in tqdm.tqdm_notebook(metapaths):\n",
"    for row in combine_dwpc_dgp(hetmat, metapath, damping, index_pairs=index_pairs):\n",
"        # Label each row with the metapath it was computed for\n",
"        row['metapath'] = str(metapath)\n",
"        rows.append(row)\n",
"metapath_df = pandas.DataFrame(rows)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>source_id</th>\n",
" <th>target_id</th>\n",
" <th>source_name</th>\n",
" <th>target_name</th>\n",
" <th>source_degree</th>\n",
" <th>target_degree</th>\n",
" <th>path_count</th>\n",
" <th>dwpc</th>\n",
" <th>n</th>\n",
" <th>nnz</th>\n",
" <th>n_perms</th>\n",
" <th>mean_nz</th>\n",
" <th>sd_nz</th>\n",
" <th>p_value</th>\n",
" <th>metapath</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>79068</td>\n",
" <td>DOID:9970</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>373</td>\n",
" <td>1</td>\n",
" <td>5.267578</td>\n",
" <td>10100</td>\n",
" <td>1740</td>\n",
" <td>100</td>\n",
" <td>5.267578</td>\n",
" <td>6.468376e-08</td>\n",
" <td>0.086139</td>\n",
" <td>GaD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>79068</td>\n",
" <td>DOID:9970</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>2</td>\n",
" <td>45</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>106500</td>\n",
" <td>1220</td>\n",
" <td>100</td>\n",
" <td>7.200037</td>\n",
" <td>1.727540e-07</td>\n",
" <td>1.000000</td>\n",
" <td>GdD</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" source_id target_id source_name target_name source_degree target_degree \\\n",
"0 79068 DOID:9970 FTO obesity 6 373 \n",
"1 79068 DOID:9970 FTO obesity 2 45 \n",
"\n",
" path_count dwpc n nnz n_perms mean_nz sd_nz \\\n",
"0 1 5.267578 10100 1740 100 5.267578 6.468376e-08 \n",
"1 0 0.000000 106500 1220 100 7.200037 1.727540e-07 \n",
"\n",
" p_value metapath \n",
"0 0.086139 GaD \n",
"1 1.000000 GdD "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metapath_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>metapath</th>\n",
" <th>source_name</th>\n",
" <th>target_name</th>\n",
" <th>source_degree</th>\n",
" <th>target_degree</th>\n",
" <th>path_count</th>\n",
" <th>dwpc</th>\n",
" <th>mean_nz</th>\n",
" <th>n</th>\n",
" <th>nnz</th>\n",
" <th>p_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>GpBPpGaD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>32</td>\n",
" <td>373</td>\n",
" <td>435</td>\n",
" <td>2.814122</td>\n",
" <td>2.100517</td>\n",
" <td>14500</td>\n",
" <td>14500</td>\n",
" <td>4.747076e-08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>GeAeGaD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>28</td>\n",
" <td>373</td>\n",
" <td>6204</td>\n",
" <td>2.002286</td>\n",
" <td>1.870643</td>\n",
" <td>26500</td>\n",
" <td>26500</td>\n",
" <td>7.739905e-08</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>GaDaGaD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>373</td>\n",
" <td>280</td>\n",
" <td>4.283209</td>\n",
" <td>3.463896</td>\n",
" <td>10100</td>\n",
" <td>10100</td>\n",
" <td>3.328533e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>GaDpSpD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>17</td>\n",
" <td>25</td>\n",
" <td>4.434438</td>\n",
" <td>2.443015</td>\n",
" <td>50500</td>\n",
" <td>50498</td>\n",
" <td>1.351195e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>GpPWpGaD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>1</td>\n",
" <td>373</td>\n",
" <td>2</td>\n",
" <td>3.687043</td>\n",
" <td>1.467271</td>\n",
" <td>107700</td>\n",
" <td>100783</td>\n",
" <td>7.459857e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>GaDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>5.138905</td>\n",
" <td>3.917056</td>\n",
" <td>90900</td>\n",
" <td>16242</td>\n",
" <td>2.361471e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>GaDrDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>11</td>\n",
" <td>4.850720</td>\n",
" <td>2.435138</td>\n",
" <td>90900</td>\n",
" <td>83771</td>\n",
" <td>3.827697e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>GaDlAlD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>33</td>\n",
" <td>42</td>\n",
" <td>3.744022</td>\n",
" <td>2.730794</td>\n",
" <td>10100</td>\n",
" <td>10100</td>\n",
" <td>6.445393e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>GcGiGdD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>6</td>\n",
" <td>45</td>\n",
" <td>2</td>\n",
" <td>4.111242</td>\n",
" <td>2.415512</td>\n",
" <td>57700</td>\n",
" <td>14148</td>\n",
" <td>1.770697e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>GdDpSpD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>2</td>\n",
" <td>17</td>\n",
" <td>5</td>\n",
" <td>3.383199</td>\n",
" <td>1.939016</td>\n",
" <td>532500</td>\n",
" <td>504211</td>\n",
" <td>2.750328e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>185</th>\n",
" <td>GiGuDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1.130016</td>\n",
" <td>2.068239</td>\n",
" <td>1442700</td>\n",
" <td>61314</td>\n",
" <td>3.331929e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>GeAlDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>28</td>\n",
" <td>5</td>\n",
" <td>18</td>\n",
" <td>1.161357</td>\n",
" <td>0.720075</td>\n",
" <td>238500</td>\n",
" <td>238489</td>\n",
" <td>4.016257e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>GdDuGdD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>2</td>\n",
" <td>45</td>\n",
" <td>4</td>\n",
" <td>3.838754</td>\n",
" <td>2.930357</td>\n",
" <td>106500</td>\n",
" <td>72110</td>\n",
" <td>4.128653e-02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" metapath source_name target_name source_degree target_degree \\\n",
"57 GpBPpGaD FTO obesity 32 373 \n",
"41 GeAeGaD FTO obesity 28 373 \n",
"108 GaDaGaD FTO obesity 6 373 \n",
"117 GaDpSpD FTO obesity 6 17 \n",
"249 GpPWpGaD FTO obesity 1 373 \n",
"12 GaDrD FTO obesity 6 5 \n",
"107 GaDrDrD FTO obesity 6 5 \n",
"102 GaDlAlD FTO obesity 6 33 \n",
"166 GcGiGdD FTO obesity 6 45 \n",
"133 GdDpSpD FTO obesity 2 17 \n",
"185 GiGuDrD FTO obesity 2 5 \n",
"37 GeAlDrD FTO obesity 28 5 \n",
"131 GdDuGdD FTO obesity 2 45 \n",
"\n",
" path_count dwpc mean_nz n nnz p_value \n",
"57 435 2.814122 2.100517 14500 14500 4.747076e-08 \n",
"41 6204 2.002286 1.870643 26500 26500 7.739905e-08 \n",
"108 280 4.283209 3.463896 10100 10100 3.328533e-07 \n",
"117 25 4.434438 2.443015 50500 50498 1.351195e-04 \n",
"249 2 3.687043 1.467271 107700 100783 7.459857e-04 \n",
"12 3 5.138905 3.917056 90900 16242 2.361471e-03 \n",
"107 11 4.850720 2.435138 90900 83771 3.827697e-03 \n",
"102 42 3.744022 2.730794 10100 10100 6.445393e-03 \n",
"166 2 4.111242 2.415512 57700 14148 1.770697e-02 \n",
"133 5 3.383199 1.939016 532500 504211 2.750328e-02 \n",
"185 1 1.130016 2.068239 1442700 61314 3.331929e-02 \n",
"37 18 1.161357 0.720075 238500 238489 4.016257e-02 \n",
"131 4 3.838754 2.930357 106500 72110 4.128653e-02 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show nominally significant metapaths\n",
"(\n",
" metapath_df\n",
" .sort_values('p_value')\n",
" .query(\"p_value < 0.05\")\n",
" [['metapath', 'source_name', 'target_name', 'source_degree', 'target_degree', 'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value']]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>metapath</th>\n",
" <th>source_name</th>\n",
" <th>target_name</th>\n",
" <th>source_degree</th>\n",
" <th>target_degree</th>\n",
" <th>path_count</th>\n",
" <th>dwpc</th>\n",
" <th>mean_nz</th>\n",
" <th>n</th>\n",
" <th>nnz</th>\n",
" <th>p_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>GuCpDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>15928200</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>GuCtDrD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>15928200</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>GuCbGaD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>0</td>\n",
" <td>373</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1769800</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>GuCbGuD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>0</td>\n",
" <td>74</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1769800</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>GpPWpGuD</td>\n",
" <td>FTO</td>\n",
" <td>obesity</td>\n",
" <td>1</td>\n",
" <td>74</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.658241</td>\n",
" <td>107700</td>\n",
" <td>56287</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" metapath source_name target_name source_degree target_degree \\\n",
"91 GuCpDrD FTO obesity 0 5 \n",
"92 GuCtDrD FTO obesity 0 5 \n",
"93 GuCbGaD FTO obesity 0 373 \n",
"95 GuCbGuD FTO obesity 0 74 \n",
"251 GpPWpGuD FTO obesity 1 74 \n",
"\n",
" path_count dwpc mean_nz n nnz p_value \n",
"91 0 0.0 NaN 15928200 0 1.0 \n",
"92 0 0.0 NaN 15928200 0 1.0 \n",
"93 0 0.0 NaN 1769800 0 1.0 \n",
"95 0 0.0 NaN 1769800 0 1.0 \n",
"251 0 0.0 1.658241 107700 56287 1.0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Non-significant metapaths\n",
"(\n",
" metapath_df\n",
" .sort_values('p_value')\n",
" .tail()\n",
" [['metapath', 'source_name', 'target_name', 'source_degree', 'target_degree', 'path_count', 'dwpc', 'mean_nz', 'n', 'nnz', 'p_value']]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hetionet Neo4j Queries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create DWPC queries for other metapaths, you can use the following method:\n",
"\n",
"\n",
"```python\n",
"import hetio.neo4j\n",
"metapath = hetmat.metagraph.get_metapath('GaDpSpD')\n",
"query = hetio.neo4j.construct_dwpc_query(metapath)\n",
"print(query)\n",
"```\n",
"\n",
"\n",
"## Top _GpBPpGaD_ paths\n",
"\n",
"```cypher\n",
"MATCH path = (n0:Gene)-[:PARTICIPATES_GpBP]-(n1)-[:PARTICIPATES_GpBP]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
"WHERE n0.name = 'FTO'\n",
" AND n3.name = 'obesity'\n",
"AND n0 <> n2\n",
"WITH [\n",
" size((n0)-[:PARTICIPATES_GpBP]-()),\n",
" size(()-[:PARTICIPATES_GpBP]-(n1)),\n",
" size((n1)-[:PARTICIPATES_GpBP]-()),\n",
" size(()-[:PARTICIPATES_GpBP]-(n2)),\n",
" size((n2)-[:ASSOCIATES_DaG]-()),\n",
" size(()-[:ASSOCIATES_DaG]-(n3))\n",
"] AS degrees, path\n",
"RETURN\n",
" path,\n",
" substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
" reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
"ORDER BY pdp DESC\n",
"LIMIT 10\n",
"```\n",
"\n",
"## Top _GeAeGaD_ paths\n",
"\n",
"```cypher\n",
"MATCH path = (n0:Gene)-[:EXPRESSES_AeG]-(n1)-[:EXPRESSES_AeG]-(n2)-[:ASSOCIATES_DaG]-(n3:Disease)\n",
"WHERE n0.name = 'FTO'\n",
" AND n3.name = 'obesity'\n",
"AND n0 <> n2\n",
"WITH [\n",
"size((n0)-[:EXPRESSES_AeG]-()),\n",
"size(()-[:EXPRESSES_AeG]-(n1)),\n",
"size((n1)-[:EXPRESSES_AeG]-()),\n",
"size(()-[:EXPRESSES_AeG]-(n2)),\n",
"size((n2)-[:ASSOCIATES_DaG]-()),\n",
"size(()-[:ASSOCIATES_DaG]-(n3))\n",
"] AS degrees, path\n",
"RETURN\n",
" path,\n",
" substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
" reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
"ORDER BY pdp DESC\n",
"LIMIT 10\n",
"```\n",
"\n",
"\n",
"## Top _GaDpSpD_ paths\n",
"\n",
"```cypher\n",
"MATCH path = (n0:Gene)-[:ASSOCIATES_DaG]-(n1)-[:PRESENTS_DpS]-(n2)-[:PRESENTS_DpS]-(n3:Disease)\n",
"WHERE n0.name = 'FTO'\n",
" AND n3.name = 'obesity'\n",
"AND n1 <> n3\n",
"WITH [\n",
"size((n0)-[:ASSOCIATES_DaG]-()),\n",
"size(()-[:ASSOCIATES_DaG]-(n1)),\n",
"size((n1)-[:PRESENTS_DpS]-()),\n",
"size(()-[:PRESENTS_DpS]-(n2)),\n",
"size((n2)-[:PRESENTS_DpS]-()),\n",
"size(()-[:PRESENTS_DpS]-(n3))\n",
"] AS degrees, path\n",
"RETURN\n",
" path,\n",
" substring(reduce(string = '', node IN nodes(path) | string + '—' + node.name), 1) AS nodes,\n",
" reduce(pdp = 1.0, d in degrees | pdp * d ^ -0.5) AS pdp\n",
"ORDER BY pdp DESC\n",
"LIMIT 10\n",
"```"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:hetmech]",
"language": "python",
"name": "conda-env-hetmech-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment