Skip to content

Instantly share code, notes, and snippets.

@callahantiff
Last active March 18, 2020 02:33
Show Gist options
  • Save callahantiff/23a8a62dece793a6f46b8464a5deddcd to your computer and use it in GitHub Desktop.
Save callahantiff/23a8a62dece793a6f46b8464a5deddcd to your computer and use it in GitHub Desktop.
Pacific Northwest National Laboratory Collaboration
############
## PURPOSE: Generate gene-centric edge lists for collaboration with PNNL
## Master Repo: https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0
## Edge Data: https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources
###########
# import needed libraries
import pandas
import pickle
import json
import urllib.request
from tqdm import tqdm
## READ IN DATA
# download and read in edge data
url1 = 'https://www.dropbox.com/s/8pqj0ft88rvpjr3/Master_Edge_List_Dict.json?dl=1'
edge_data = json.loads(urllib.request.urlopen(url1).read().decode())
# read in gene-protein map
url2 = 'https://www.dropbox.com/s/lrw2zmwrovzukzl/gene-protein_ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt?dl=1'
gene_map = pandas.read_csv(url2, header=None, sep='\t')
gene_map[2] = gene_map[2].apply(lambda i: 'not protein-coding' if i != 'protein-coding' else 'protein-coding')
# convert df to dictionary
gene_map_dict = {}
for idx, row in tqdm(gene_map.iterrows(), total=gene_map.shape[0]):
if row[1] in gene_map_dict.keys():
gene_map_dict[row[1]]['gene'].append(row[0])
gene_map_dict[row[1]]['gene_type'].append(row[2] if row[2] == 'protein-coding' else 'not protein-coding')
else:
gene_map_dict[row[1]] = {}
gene_map_dict[row[1]]['gene'] = [row[0]]
gene_map_dict[row[1]]['gene_type'] = [row[2] if row[2] == 'protein-coding' else 'not protein-coding']
## CREATE EDGE LISTS
edge_types_of_interest = ['protein-gobp', 'protein-pathway', 'chemical-protein', 'gene-disease']
# create edge lists
ordered_edge_list = {}
gene_type = {}
for edge_type in tqdm(edge_types_of_interest):
ordered_edge_list[edge_type] = {}
temp_edge_dict = {}
for edge in tqdm(edge_data[edge_type]['edge_list']):
# set subject and object identifiers
if 'protein' in edge_type and edge[edge_type.split('-').index('protein')] in gene_map_dict.keys():
protein_idx = edge_type.split('-').index('protein')
protein = edge[protein_idx]
sbj, obj = edge[1 - protein_idx], gene_map_dict[edge[protein_idx]]['gene'][0]
# update gene metadata
gene_type[obj] = gene_map_dict[edge[protein_idx]]['gene_type'][0]
elif 'gene' in edge_type:
sbj, obj = edge[1 - edge_type.split('-').index('gene')], edge[edge_type.split('-').index('gene')]
# update gene metadata
x = [gene_map_dict[x]['gene_type'][0] for x in gene_map_dict.keys() if gene_map_dict[x]['gene'][0] == gene]
gene_type[obj] = x[0]
else:
continue
# add edge info to dictionary
if sbj in temp_edge_dict.keys():
temp_edge_dict[sbj].append(obj)
else:
temp_edge_dict[sbj] = [obj]
# add result dictionary to master dictionary
ordered_edge_list[edge_type] = temp_edge_dict
## GET COUNTS OF GENE SETS
# Get counts of genes by subject type
cleaned_dict, total_unique_genes = {}, set()
for edge_type in tqdm(ordered_edge_list.keys()):
# get edge counts
print('*** EDGE TYPE: {} ***'.format(edge_type))
subject, gene_edges = set(), []
for edge in ordered_edge_list[edge_type].keys():
# add all entries with more than 1 gene
# if len(ordered_edge_list[edge_type][edge]) > 1:
cleaned_dict.update({edge: list(set(ordered_edge_list[edge_type][edge]))})
gene_edges.append(list(set(ordered_edge_list[edge_type][edge])))
subject |= {edge}
total_unique_genes |= {edge}
print('Subject Count: {}'.format(len(set(subject))))
print('Gene Count - Min Gene Set Size: {}'.format(min([len(x) for x in gene_edges])))
print('Gene Count - Average Gene Set Size: {}'.format(sum([len(x) for x in gene_edges]) / len(gene_edges)))
print('Gene Count - Max Gene Set Size: {}'.format(max([len(x) for x in gene_edges])))
idx = [1 - edge_type.split('-').index('protein') if 'pro' in edge_type else 1 - edge_type.split('-').index('gene')]
subj_type = edge_type.split('-')[idx[0]]
set_count = len(set([x for y in gene_edges for x in y]))
print('Gene Count - Total Unique Genes for all {} sets: {}'.format(subj_type, set_count))
print('\n')
print('Gene Count: {}'.format(len(total_unique_genes)))
## WRITE RESULTS
pickle.dump(cleaned_dict, open('PheKnowLator_GeneLists_5March2020.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(gene_type, open('PheKnowLator_GeneMetadata_5March2020.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
@callahantiff
Copy link
Author

callahantiff commented Mar 5, 2020

Data Used on March 5, 2020:
INPUT DATA:


OUTPUT DATA:


RESULTS:
Total Unique Genes Across All Subject's Gene Sets: 17,806

Subject Subject Count Min Gene Set Size Avg gene Set Size Max gene Set Size Total Unique Genes
GO Biological Process 12,305 1 11.1975 1,133 17,252
Reactome Pathways 2,291 1 48.1336 2,732 10,677
Chemicals 3,289 1 14.8437 1,567 6,445
Diseases 2,683 1 15.7499 808 5,203

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment