Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/6ef84ee6940b4f28d85c to your computer and use it in GitHub Desktop.
Save afrendeiro/6ef84ee6940b4f28d85c to your computer and use it in GitHub Desktop.
Mining cosmicDB
#!/usr/bin/env python
# Mining "COSMIC: Catalogue Of Somatic Mutations In Cancer" database
import os
from subprocess import call
import pandas as pd
# COSMIC requires a login to download its data, so neither BioMart nor a plain
# wget is expected to succeed. The download is therefore best-effort: exit
# statuses are deliberately not checked, and files already on disk are reused
# so that a manually downloaded export is picked up and re-runs do not fetch
# a duplicate archive or ask gzip to overwrite an existing TSV.
if not os.path.exists("CosmicCompleteExport.tsv"):
    if not os.path.exists("CosmicCompleteExport.tsv.gz"):
        call(["wget", "http://cancer.sanger.ac.uk/files/cosmic/current_release/CosmicCompleteExport.tsv.gz"])
    # Only decompress when there is actually an archive to work on.
    if os.path.exists("CosmicCompleteExport.tsv.gz"):
        call(["gzip", "-d", "CosmicCompleteExport.tsv.gz"])
# Load the complete COSMIC export (tab-separated).
cosmic = pd.read_csv('CosmicCompleteExport.tsv', sep="\t")

# Load the epigenetic protein table and keep only its annotation columns.
epi_columns = ['class', 'Associated Gene Name', 'Ensembl Gene ID']
epi = pd.read_csv('epigenetic_proteins.ids.csv')[epi_columns]

# Inner-join both tables on the gene name and save the matching COSMIC rows.
merged = epi.merge(
    cosmic,
    how="inner",
    left_on="Associated Gene Name",
    right_on="Gene name",
)
merged.to_csv("epigenetic_proteins.cosmicDBMutations.csv", index=False)
# Per-gene summary of the merged table: number of rows, number of distinct
# primary sites, and the distinct primary sites joined with "|".
sites_by_gene = merged.groupby('Gene name')['Primary site']
df = pd.concat(
    [
        sites_by_gene.size(),  # n. rows per gene
        sites_by_gene.nunique(),  # n. unique tissues (missing values ignored)
        # Unique tissues in order of first appearance. Missing values are
        # dropped and values are cast to str so that "|".join cannot fail on
        # a NaN entry.
        sites_by_gene.apply(
            lambda sites: "|".join(sites.dropna().astype(str).unique())
        ),
    ],
    axis=1,
    keys=["Entries Count", "Tissue Count", "Tissues"],
)
# The group key becomes the first output column.
df.index.name = "Associated Gene Name"
df = df.reset_index()
df.to_csv("epigenetic_proteins.cosmicDBMutations.tissues.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment