Created
June 24, 2013 20:29
-
-
Save rameshvarun/5853333 to your computer and use it in GitHub Desktop.
Simple script to parse TCGA mRNA data and create a matrix of gene expression information.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
#Configuration | |
DELIM = "\t" | |
HEADERS = True | |
TRANSPOSED = True | |
class Sample: | |
def __init__(self): | |
self.expression = {} | |
def parseBarcode(self): | |
words = self.Barcode.split("-") | |
self.Project = words[0] | |
self.TSS = words[1] | |
self.Participant = words[2] | |
self.SampleVial = words[3] | |
self.PortionAnalyte = words[4] | |
self.Plate = words[5] | |
self.CenterCode = words[6] | |
file_manifest = open("file_manifest.txt", "r") | |
file_manifest_header = None | |
#Collection for storing all sample objects read in from the file manifest | |
samples = [] | |
#Read manifest file | |
for line in file_manifest: | |
if file_manifest_header == None: | |
file_manifest_header = [] | |
for column_name in line.split("\t"): | |
file_manifest_header.append( column_name.strip() ) | |
else: | |
words = line.split("\t") | |
sample = Sample() | |
sample.PlatformType = words[ file_manifest_header.index("Platform Type") ].strip() | |
sample.Center = words[ file_manifest_header.index("Center") ].strip() | |
sample.Platform = words[ file_manifest_header.index("Platform") ].strip() | |
sample.Level = words[ file_manifest_header.index("Level") ].strip() | |
sample.Sample = words[ file_manifest_header.index("Sample") ].strip() | |
sample.Barcode = words[ file_manifest_header.index("Barcode") ].strip() | |
sample.FileName = words[ file_manifest_header.index("File Name") ].strip() | |
if sample.PlatformType != "METADATA": | |
sample.parseBarcode() | |
samples.append( sample ) | |
file_manifest.close() | |
print( str( len(samples) ) + " samples in file manifest.") | |
print( "Trimming samples.") | |
#In case you want to trim specific samples | |
samples_to_evaluate = [] | |
for sample in samples: | |
samples_to_evaluate.append( sample ) | |
print( str( len(samples_to_evaluate) ) + " samples will be evaluated.") | |
genes = [] #A List of Unique Gene Names. This will also define the order in which genes appear in the final sample | |
count = 0 #To keep track of progress | |
for sample in samples_to_evaluate: | |
filename = sample.PlatformType + "/" + sample.Center + "__" + sample.Platform + "/Level_" + sample.Level + "/" + sample.FileName | |
sample_file = open(filename , "r") | |
sample_file_header = None | |
for line in sample_file: | |
if sample_file_header == None: | |
sample_file_header = [] | |
for column_name in line.split("\t"): | |
sample_file_header.append( column_name.strip() ) | |
else: | |
words = line.split("\t") | |
GENE = words[ sample_file_header.index("gene symbol") ].strip() | |
VALUE = words[ sample_file_header.index("value") ].strip() | |
if GENE not in genes: | |
genes.append(GENE) | |
if VALUE == "null": | |
VALUE = "0" | |
sample.expression[GENE] = VALUE | |
sample_file.close() | |
count += 1 | |
if count % 2 == 0: | |
print( str( (float(count)/len(samples_to_evaluate))*100 ) + "% done reading sample files." ) | |
print "Expression data successfully read." | |
print( str( len(genes) ) + " genes in the data.") | |
print "Trimming Genes..." | |
genes_to_evaluate = [] | |
for gene in genes: | |
genes_to_evaluate.append( gene ) | |
print( str( len(genes_to_evaluate) ) + " genes will be evaluated.") | |
print "Writing output..." | |
#Output Data File | |
output_file = open("output.txt", "w") | |
if TRANSPOSED == False: | |
if HEADERS: | |
output_file.write("SAMPLE_BARCODE") | |
for gene in genes_to_evaluate: | |
output_file.write(DELIM + gene) | |
output_file.write("\n") | |
for sample in samples_to_evaluate: | |
empty = True | |
if HEADERS: | |
output_file.write(sample.Barcode) | |
empty = False | |
for gene in genes_to_evaluate: | |
if empty == False: | |
output_file.write(DELIM) | |
else: | |
empty = False | |
output_file.write(sample.expression[ gene ] ) | |
output_file.write("\n") | |
else: | |
if HEADERS: | |
output_file.write("GENE_NAME") | |
for sample in samples_to_evaluate: | |
output_file.write(DELIM + sample.Barcode) | |
output_file.write("\n") | |
for gene in genes_to_evaluate: | |
empty = True | |
if HEADERS: | |
output_file.write(gene) | |
empty = False | |
for sample in samples_to_evaluate: | |
if empty == False: | |
output_file.write(DELIM) | |
else: | |
empty = False | |
output_file.write(sample.expression[ gene ] ) | |
output_file.write("\n") | |
output_file.flush() | |
print "Output matrix successfully created." | |
#A small segment to help with the dot graphics files that banjo generates | |
dot_helper = open("output.dot", "w") | |
for i in range( len(genes_to_evaluate) ): | |
dot_helper.write(str(i) + " [label=\"" + genes_to_evaluate[i] + "\"];") | |
dot_helper.write("\n") | |
dot_helper.flush() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment