Last active
August 29, 2015 14:12
-
-
Save afrendeiro/6ef84ee6940b4f28d85c to your computer and use it in GitHub Desktop.
Mining cosmicDB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Mine the "COSMIC: Catalogue Of Somatic Mutations In Cancer" database.

Intersects the COSMIC complete export with a list of epigenetic proteins and
writes two CSV files to the working directory:

* ``epigenetic_proteins.cosmicDBMutations.csv`` - every COSMIC row whose
  "Gene name" matches an "Associated Gene Name" of the protein list.
* ``epigenetic_proteins.cosmicDBMutations.tissues.csv`` - one row per gene
  with the number of matching rows and the distinct primary sites.
"""
import os
from subprocess import call

import pandas as pd

COSMIC_URL = "http://cancer.sanger.ac.uk/files/cosmic/current_release/CosmicCompleteExport.tsv.gz"
COSMIC_ARCHIVE = "CosmicCompleteExport.tsv.gz"
COSMIC_TSV = "CosmicCompleteExport.tsv"

SUMMARY_COLUMNS = ["Associated Gene Name", "Entries Count", "Tissue Count", "Tissues"]


def fetch_cosmic_export():
    """Return the path of the uncompressed COSMIC export, downloading if needed.

    COSMIC requires a login to download data, so the plain ``wget`` below is
    expected to fail for anonymous users. An export that is already present
    in the working directory (e.g. downloaded by hand) is therefore used
    as-is instead of being fetched again.

    Raises:
        RuntimeError: if the export is absent and ``wget`` or ``gzip`` exits
            with a non-zero status.
    """
    if os.path.exists(COSMIC_TSV):
        return COSMIC_TSV

    # "-O" pins the output name so a leftover archive from a failed attempt
    # is overwritten rather than wget saving the new one as "<name>.1".
    if call(["wget", "-O", COSMIC_ARCHIVE, COSMIC_URL]) != 0:
        raise RuntimeError(
            "Could not download %s (COSMIC requires a login); download it "
            "manually and place %s in the working directory."
            % (COSMIC_URL, COSMIC_TSV)
        )
    if call(["gzip", "-d", COSMIC_ARCHIVE]) != 0:
        raise RuntimeError("Could not decompress %s" % COSMIC_ARCHIVE)
    return COSMIC_TSV


def summarize_tissues(merged):
    """Summarise the primary sites recorded for each gene.

    Args:
        merged: DataFrame with at least the columns "Gene name" and
            "Primary site".

    Returns:
        DataFrame with one row per gene (sorted by gene name) and the columns
        "Associated Gene Name", "Entries Count" (rows for that gene),
        "Tissue Count" (distinct non-missing primary sites) and "Tissues"
        (those sites joined with "|", in order of first appearance).
    """
    sites = merged.groupby("Gene name")["Primary site"]
    summary = pd.DataFrame({
        "Entries Count": sites.size(),
        # nunique() ignores missing values, matching the joined list below.
        "Tissue Count": sites.nunique(),
        # Missing sites are dropped and the rest stringified so that
        # str.join never receives a NaN / non-string value.
        "Tissues": sites.agg(
            lambda s: "|".join(str(site) for site in s.dropna().unique())
        ),
    })
    summary = summary.rename_axis("Associated Gene Name").reset_index()
    return summary[SUMMARY_COLUMNS]


def main():
    """Run the full pipeline: fetch, intersect and write both CSV files."""
    # read CosmicDB data
    cosmic = pd.read_csv(fetch_cosmic_export(), delimiter="\t")

    # read in epigenetic protein info
    epi = pd.read_csv("epigenetic_proteins.ids.csv")
    epi = epi[["class", "Associated Gene Name", "Ensembl Gene ID"]]

    # intersect and write csvs
    merged = pd.merge(
        epi, cosmic, left_on="Associated Gene Name", right_on="Gene name"
    )
    merged.to_csv("epigenetic_proteins.cosmicDBMutations.csv", index=False)

    summarize_tissues(merged).to_csv(
        "epigenetic_proteins.cosmicDBMutations.tissues.csv", index=False
    )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment