Created
April 5, 2019 08:05
-
-
Save saarthak24/8ffa73138258257e12002cfca6cb00b6 to your computer and use it in GitHub Desktop.
Created while conducting research for a project on "Identifying and Ranking Amygdala Genes Linked to Autism Spectrum Disorder" at Children's National Health System.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys
from collections import defaultdict

import numpy as np
# Print NumPy arrays in full instead of summarising long ones with "...".
# Current NumPy rejects threshold=np.nan with a ValueError; sys.maxsize is
# the supported way to ask for untruncated output.
np.set_printoptions(threshold=sys.maxsize)
# Data for genes and associated IDs.
# Read the annotation table as raw lines; the ``with`` block closes the file
# even if reading fails.
with open("Data/GPL5175-3188.txt") as data:
    geneList = data.read().split("\n")

# Map column 0 of each row (the ID) to column 9 (the text that is later
# searched for gene symbols).
# NOTE(review): the row range 13..316931 is hard-coded for this exact file;
# a file with fewer lines or short rows raises IndexError.
geneDict = {}
for i in range(13, 316932):
    columns = geneList[i].split("\t")
    geneDict[columns[0]] = columns[9]
# Detailed data for each gene.
# Kept as a list of raw lines because later code addresses both the sample
# metadata rows and the expression rows by absolute line index.
with open("Data/GSE25219-GPL5175_series_matrix.txt") as data:
    geneData = data.read().split("\n")
# Import SFARI gene list.
# SFARIlist maps the value in comma-separated column 1 of each row (used
# later as the gene name) to the list of values taken from a column counted
# from the end of the row (used later as the SFARI rank).
SFARIlist = defaultdict(list)
with open("Data/SFARI.csv") as data:
    SFARI = data.read().split("\n")

# NOTE(review): rows 1..280 are hard-coded for this exact file, and the naive
# comma split is only correct while the columns that are read contain no
# quoted commas.
for i in range(1, 281):
    dataRow = SFARI[i].split('\t')
    fields = dataRow[0].split(',')
    # With no tab in the line len(dataRow) is 1, so this index is -3:
    # the third field from the end.
    SFARIlist[fields[1]].append(fields[len(dataRow) - 4])

SFARIset = set(SFARIlist)
# For every SFARI gene name, collect the annotation IDs whose text mentions it.
# Names with no match get no entry, so the printed length counts matched
# genes only.
SFARIKang = defaultdict(list)
for symbol in SFARIlist:
    # NOTE(review): this is a plain substring test, so a short name also
    # matches inside longer names; confirm that is acceptable.
    hits = [gene_id for gene_id, text in geneDict.items() if symbol in text]
    if hits:
        SFARIKang[symbol].extend(hits)
print(len(SFARIKang))
# Import Walsh amygdala gene list.
# Walshlist holds comma-separated column 4 of each data row, in file order
# and with repeats kept; Walshset is the same values de-duplicated.
Walshlist = []
with open("Data/Walsh Proband Amygdala Genes.csv") as data:
    Walsh = data.read().split("\n")

# NOTE(review): rows 1..340 are hard-coded for this exact file.
for i in range(1, 341):
    dataRow = Walsh[i].split('\t')
    Walshlist.append(dataRow[0].split(',')[4])

Walshset = set(Walshlist)
# For every Walsh gene name, collect the annotation IDs whose text mentions it.
WalshKang = defaultdict(list)
# Walshlist can repeat a name. Visiting each distinct name once (in first-seen
# order) avoids appending the same IDs to a gene several times and avoids
# rescanning geneDict for repeats.
for symbol in dict.fromkeys(Walshlist):
    for key, value in geneDict.items():
        # NOTE(review): plain substring test, so a short name also matches
        # inside longer names; confirm that is acceptable.
        if symbol in value:
            WalshKang[symbol].append(key)
print(len(WalshKang))
# Union of both gene-to-ID maps as a plain dict. SFARI genes come first; for
# a gene present in both maps the Walsh entry replaces the SFARI one.
SFARIWalshKang = dict(SFARIKang)
SFARIWalshKang.update(WalshKang)
print(len(SFARIWalshKang))
# Make matrix of samples with relevant information.
def _sample_field(line_no, label=None):
    """Return the tab-separated cells of series-matrix line *line_no*.

    Double quotes are removed first; when *label* is given, every occurrence
    of that text is removed as well, leaving only the bare values.
    """
    text = geneData[line_no].replace('"', '')
    if label is not None:
        text = text.replace(label, '')
    return text.split('\t')


# NOTE(review): the line numbers below are hard-coded for this exact series
# matrix file.
sample_titles = _sample_field(67)
sample_geo_accessions = _sample_field(61)
sample_organism = _sample_field(68)
sample_braincode = _sample_field(69, 'brain code: ')
sample_region = _sample_field(70, 'region: ')
sample_hemisphere = _sample_field(71, 'hemisphere: ')
sample_sex = _sample_field(72, 'Sex: ')
sample_age = _sample_field(73, 'age: ')
sample_stage = _sample_field(74, 'Stage: ')
sample_postmortem_interval = _sample_field(75, 'postmortem interval: ')
sample_ph = _sample_field(76, 'ph: ')
sample_rna_integrity = _sample_field(77, 'rna integrity number: ')

# One row per cell position of the metadata lines, one column per attribute
# (column 4 is the region, column 8 the stage).
samples = np.column_stack((
    sample_titles,
    sample_geo_accessions,
    sample_organism,
    sample_braincode,
    sample_region,
    sample_hemisphere,
    sample_sex,
    sample_age,
    sample_stage,
    sample_postmortem_interval,
    sample_ph,
    sample_rna_integrity,
))
# Summarise amygdala ("AMY") expression per series-matrix row.
#   results - IDs with at least one AMY sample in stages 2-8 at or above the
#             cutoff (each ID recorded once)
#   rows    - [ID, mean for stages 2..8, amygdala average, ratio] for those IDs
results = []
rows = []
# NOTE(review): this handle shadows the imported ``csv`` module. The name is
# kept because the code that writes the data rows further down uses it.
csv = open('Gene List.csv', "w")
headers = "Gene Name, SFARI Rank [S-3], Period 2, Period 3, Period 4, Period 5, Period 6, Period 7, Period 8, Average Expression, Ratio, Walsh \n"
csv.write(headers)

_STAGES = range(2, 9)       # stages summarised individually
_EXPRESSION_CUTOFF = 6.0    # minimum value for an ID to be kept
_flagged = set()            # same IDs as ``results``, for O(1) membership tests

# NOTE(review): rows 107..17671 are hard-coded for this exact file.
for i in range(107, 17672):
    dataRow = geneData[i].replace('"', '').split('\t')
    row_id = dataRow[0]
    total = []
    by_stage = {stage: [] for stage in _STAGES}

    # Cell x of the row pairs with samples[x]; cell 0 is the row's own ID.
    for x in range(1, len(dataRow)):
        level = float(dataRow[x])
        total.append(level)
        if samples[x][4] != "AMY":
            continue
        stage = int(samples[x][8])
        if stage not in by_stage:
            continue
        if level >= _EXPRESSION_CUTOFF and row_id not in _flagged:
            _flagged.add(row_id)
            results.append(row_id)
        # NOTE(review): indentation was lost in the source. Every AMY sample
        # of the stage is assumed to feed the stage mean, not only samples
        # at or above the cutoff; confirm against the original script.
        by_stage[stage].append(level)

    # Mean per stage; a stage with no AMY samples counts as 0.0.
    P2, P3, P4, P5, P6, P7, P8 = [
        np.sum(levels) / len(levels) if levels else 0.0
        for levels in by_stage.values()
    ]
    # NOTE(review): the divisor is always 7, so stages without AMY samples
    # pull the average down; confirm this is intended.
    amyAvg = (P2 + P3 + P4 + P5 + P6 + P7 + P8) / 7.0
    totalAvg = np.sum(total) / len(total)
    Ratio = amyAvg / totalAvg

    if row_id in _flagged:
        rows.append([row_id, P2, P3, P4, P5, P6, P7, P8, amyAvg, Ratio])

print(len(rows))
# Attach each retained row to the gene(s) whose ID list contains the row's ID,
# then write one CSV line per gene and close the output file.

# Reverse index (ID -> gene names, in SFARIWalshKang order) so each row is
# matched with one dict lookup instead of scanning every gene's ID list.
_genes_by_id = defaultdict(list)
for key, value in SFARIWalshKang.items():
    for gene_id in dict.fromkeys(value):
        _genes_by_id[gene_id].append(key)

GenesFiltered = defaultdict(list)
for i in rows:
    for key in _genes_by_id.get(i[0], ()):
        GenesFiltered[key].append(i)

_walsh_symbols = set(Walshlist)

# ``csv`` is the output file handle opened earlier (it shadows the module).
# Using it as a context manager guarantees it is flushed and closed; the
# original never closed it.
with csv:
    for key, value in GenesFiltered.items():
        # NOTE(review): only the first matching row of a gene is written;
        # any further rows for the same gene are ignored.
        first = value[0]
        # The original indexed the "No" fallback string with [0] and wrote
        # "N" for genes missing from the SFARI list.
        sfari_rank = SFARIlist[key][0] if key in SFARIlist else "No"
        walsh_flag = "Yes" if key in _walsh_symbols else "No"
        fields = [key, sfari_rank]
        # first[1:10] = seven stage means, amygdala average, ratio.
        fields.extend(str(round(stat, 3)) for stat in first[1:10])
        fields.append(walsh_flag)
        csv.write(",".join(fields) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment