Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Created January 9, 2021 08:53
Show Gist options
  • Save PoisonAlien/fa4199e34a089a873820fd46eba028df to your computer and use it in GitHub Desktop.
Save PoisonAlien/fa4199e34a089a873820fd46eba028df to your computer and use it in GitHub Desktop.
Generate
# Get the COSMIC variant file from here: https://cancer.sanger.ac.uk/cosmic/download (e.g. CosmicCompleteTargetedScreensMutantExport.tsv.gz)
# You will have to register and sign in
# Read in only these selected columns: `Gene name`, `GENOMIC_MUTATION_ID`, `Mutation AA`, `Mutation Description`, `Mutation genome position`, `SNP`, `FATHMM prediction`, `HGVSG`
# Load the eight selected export columns. With header = FALSE they arrive as
# V1..V8; the shell pipeline decompresses the export, cuts the columns, drops
# the header line and sorts on the first two fields.
cosm <- data.table::fread(
  cmd = "zcat CosmicCompleteTargetedScreensMutantExport.tsv.gz | cut -f 1,17,21,22,26,28,30,40 | sed 1d | sort -k1,2",
  header = FALSE
)

# First pass: keep rows that have a mutation ID (V2) and whose description (V4)
# is neither empty nor a silent coding substitution.
has_id <- !(cosm$V2 %in% "")
described <- !(cosm$V4 %in% c("Substitution - coding silent", ""))
csom <- cosm[has_id & described]

# Collapse duplicate entries, keyed on "<V2>:<V3>". This must happen before the
# second pass so that the same first-seen row per key is retained.
csom[, id := paste0(V2, ":", V3)]
csom <- csom[!duplicated(id)]

# Second pass: drop "Unknown" descriptions and flagged SNPs (V6 == "y"), and
# keep only variants whose V7 value is "PATHOGENIC".
csom <- csom[!(V4 %in% "Unknown") & !(V6 %in% "y") & (V7 %in% "PATHOGENIC")]
# Convert locus strings of the form `chr:start-end` into a three-column table.
#
# Args:
#   loci: character vector of loci such as "10:100-200". Thousands separators
#         (",") inside the coordinates are stripped before numeric conversion.
#
# Returns:
#   A data.table with columns `chr` (character), `start` and `end` (numeric),
#   one row per element of `loci`. Zero-length input yields a zero-row table
#   with the same three columns.
loci2df <- function(loci) {
  # Guard zero-length input explicitly so the result is a well-formed empty
  # table regardless of how tstrsplit() treats `keep` when nothing is split.
  if (length(loci) == 0L) {
    return(data.table::data.table(
      chr = character(0), start = numeric(0), end = numeric(0)
    ))
  }

  # Strip "," and coerce a coordinate vector to numeric.
  to_position <- function(x) {
    as.numeric(gsub(pattern = ",", replacement = "", x = x, fixed = TRUE))
  }

  # Split once on ":" (chromosome vs. range), then split the range on "-".
  # `split` is spelled out in full rather than relying on partial matching.
  chr_range <- data.table::tstrsplit(
    x = loci, split = ":", fixed = TRUE, keep = 1:2
  )
  bounds <- data.table::tstrsplit(
    x = chr_range[[2]], split = "-", fixed = TRUE, keep = 1:2
  )

  data.table::data.table(
    chr = as.character(chr_range[[1]]),
    start = to_position(bounds[[1]]),
    end = to_position(bounds[[2]])
  )
}
# Assemble the output table: genomic position parsed from V5, plus the
# annotation columns carried over from the filtered COSMIC rows in `csom`.
locdf <- loci2df(csom$V5)
locdf$type <- csom$V4
locdf$Hugo_Symbol <- csom$V1
locdf$HGVSp <- csom$V3
locdf$COSMIC_ID <- csom$V2

# Derive ref/alt alleles from the genomic HGVS string (V8). Only strings that
# end in "<base>><base>" carry a single-base change; anything else (e.g. values
# ending in "del", "dup" or "ins...") has no such pair, so ref/alt are set to NA
# rather than slicing arbitrary trailing characters out of the string.
hgvsg <- as.character(csom$V8)
snv_pattern <- "^.*([ACGT])>([ACGT])$"
is_snv <- grepl(snv_pattern, hgvsg)
locdf$ref <- ifelse(is_snv, sub(snv_pattern, "\\1", hgvsg), NA_character_)
locdf$alt <- ifelse(is_snv, sub(snv_pattern, "\\2", hgvsg), NA_character_)

# Fix the column order of the final table (this also drops `end`).
locdf <- locdf[, .(chr, start, ref, alt, Hugo_Symbol, type, HGVSp, COSMIC_ID)]

# Drop the chromosome coded as 24 (chrY, per the original author's note).
locdf <- locdf[!chr %in% 24]

# One row per COSMIC ID, keeping the first occurrence after ordering by gene.
locdf <- locdf[order(Hugo_Symbol)][!duplicated(COSMIC_ID)]

# Shorten the type label by removing the "Substitution - " prefix.
locdf[, type := gsub(pattern = "Substitution - ", replacement = "", x = type)]

# Sort by numeric chromosome code, then start position, and write a headerless
# tab-separated file.
locdf <- locdf[order(as.numeric(chr), start)]
data.table::fwrite(
  locdf, "COSMIC_nsyn_GRCh37.tsv",
  sep = "\t", col.names = FALSE
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment