Skip to content

Instantly share code, notes, and snippets.

@rameshvarun
Created June 24, 2013 20:29
Show Gist options
  • Save rameshvarun/5853333 to your computer and use it in GitHub Desktop.
Save rameshvarun/5853333 to your computer and use it in GitHub Desktop.
Simple script to parse TCGA mRNA data and create a matrix of gene expression information.
import os
#Configuration
DELIM = "\t"
HEADERS = True
TRANSPOSED = True
class Sample:
def __init__(self):
self.expression = {}
def parseBarcode(self):
words = self.Barcode.split("-")
self.Project = words[0]
self.TSS = words[1]
self.Participant = words[2]
self.SampleVial = words[3]
self.PortionAnalyte = words[4]
self.Plate = words[5]
self.CenterCode = words[6]
file_manifest = open("file_manifest.txt", "r")
file_manifest_header = None
#Collection for storing all sample objects read in from the file manifest
samples = []
#Read manifest file
for line in file_manifest:
if file_manifest_header == None:
file_manifest_header = []
for column_name in line.split("\t"):
file_manifest_header.append( column_name.strip() )
else:
words = line.split("\t")
sample = Sample()
sample.PlatformType = words[ file_manifest_header.index("Platform Type") ].strip()
sample.Center = words[ file_manifest_header.index("Center") ].strip()
sample.Platform = words[ file_manifest_header.index("Platform") ].strip()
sample.Level = words[ file_manifest_header.index("Level") ].strip()
sample.Sample = words[ file_manifest_header.index("Sample") ].strip()
sample.Barcode = words[ file_manifest_header.index("Barcode") ].strip()
sample.FileName = words[ file_manifest_header.index("File Name") ].strip()
if sample.PlatformType != "METADATA":
sample.parseBarcode()
samples.append( sample )
file_manifest.close()
print( str( len(samples) ) + " samples in file manifest.")
print( "Trimming samples.")
#In case you want to trim specific samples
samples_to_evaluate = []
for sample in samples:
samples_to_evaluate.append( sample )
print( str( len(samples_to_evaluate) ) + " samples will be evaluated.")
genes = [] #A List of Unique Gene Names. This will also define the order in which genes appear in the final sample
count = 0 #To keep track of progress
for sample in samples_to_evaluate:
filename = sample.PlatformType + "/" + sample.Center + "__" + sample.Platform + "/Level_" + sample.Level + "/" + sample.FileName
sample_file = open(filename , "r")
sample_file_header = None
for line in sample_file:
if sample_file_header == None:
sample_file_header = []
for column_name in line.split("\t"):
sample_file_header.append( column_name.strip() )
else:
words = line.split("\t")
GENE = words[ sample_file_header.index("gene symbol") ].strip()
VALUE = words[ sample_file_header.index("value") ].strip()
if GENE not in genes:
genes.append(GENE)
if VALUE == "null":
VALUE = "0"
sample.expression[GENE] = VALUE
sample_file.close()
count += 1
if count % 2 == 0:
print( str( (float(count)/len(samples_to_evaluate))*100 ) + "% done reading sample files." )
print "Expression data successfully read."
print( str( len(genes) ) + " genes in the data.")
print "Trimming Genes..."
genes_to_evaluate = []
for gene in genes:
genes_to_evaluate.append( gene )
print( str( len(genes_to_evaluate) ) + " genes will be evaluated.")
print "Writing output..."
#Output Data File
output_file = open("output.txt", "w")
if TRANSPOSED == False:
if HEADERS:
output_file.write("SAMPLE_BARCODE")
for gene in genes_to_evaluate:
output_file.write(DELIM + gene)
output_file.write("\n")
for sample in samples_to_evaluate:
empty = True
if HEADERS:
output_file.write(sample.Barcode)
empty = False
for gene in genes_to_evaluate:
if empty == False:
output_file.write(DELIM)
else:
empty = False
output_file.write(sample.expression[ gene ] )
output_file.write("\n")
else:
if HEADERS:
output_file.write("GENE_NAME")
for sample in samples_to_evaluate:
output_file.write(DELIM + sample.Barcode)
output_file.write("\n")
for gene in genes_to_evaluate:
empty = True
if HEADERS:
output_file.write(gene)
empty = False
for sample in samples_to_evaluate:
if empty == False:
output_file.write(DELIM)
else:
empty = False
output_file.write(sample.expression[ gene ] )
output_file.write("\n")
output_file.flush()
print "Output matrix successfully created."
#A small segment to help with the dot graphics files that banjo generates
dot_helper = open("output.dot", "w")
for i in range( len(genes_to_evaluate) ):
dot_helper.write(str(i) + " [label=\"" + genes_to_evaluate[i] + "\"];")
dot_helper.write("\n")
dot_helper.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment