Skip to content

Instantly share code, notes, and snippets.

@saarthak24
Created April 5, 2019 08:05
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saarthak24/8ffa73138258257e12002cfca6cb00b6 to your computer and use it in GitHub Desktop.
Save saarthak24/8ffa73138258257e12002cfca6cb00b6 to your computer and use it in GitHub Desktop.
Created while conducting research for a project on "Identifying and Ranking Amygdala Genes Linked to Autism Spectrum Disorder" at Children's National Health System.
import csv
import sys
from collections import defaultdict

import numpy as np
# Print NumPy arrays in full instead of summarising them with "...".
# A NaN threshold is rejected with ValueError by current NumPy releases;
# sys.maxsize is the value its documentation recommends for untruncated output.
np.set_printoptions(threshold=sys.maxsize)
# Data for genes and associated IDs.
# Load the GPL5175 annotation table and map each row's first column (the ID
# that is later matched against the expression matrix) to its tenth column
# (the text that gene symbols are searched for in).
with open("Data/GPL5175-3188.txt") as data:  # closed even if read() raises
    geneList = data.read().split("\n")
geneDict = {}
# The row range is hard-coded for this particular download of the file.
# NOTE(review): presumably rows 0-12 are header material -- confirm before
# swapping in a different version of the annotation file.
for i in range(13, 316932):
    columns = geneList[i].split("\t")  # split once, reuse for both columns
    geneDict[columns[0]] = columns[9]
# Detailed data for each gene.
# Keep the GSE25219 series matrix as a list of raw lines; later code picks the
# sample metadata and the expression rows out of it by line index.
with open("Data/GSE25219-GPL5175_series_matrix.txt") as data:
    geneData = data.read().split("\n")
# Import SFARI gene list: symbol -> list of values taken from the CSV.
SFARIlist = defaultdict(list)
with open("Data/SFARI.csv") as data:  # closed even if read() raises
    SFARI = data.read().split("\n")
# Line 0 is skipped and exactly rows 1..280 are read (hard-coded for this file).
for i in range(1, 281):
    dataRow = SFARI[i].split('\t')
    fields = dataRow[0].split(',')
    # len(dataRow) is 1 when the line holds no tab, which makes the index -3,
    # i.e. the third comma-separated field counted from the end.
    # NOTE(review): counting from the end presumably sidesteps commas inside
    # earlier fields -- confirm, and consider a literal -3 if so.
    SFARIlist[fields[1]].append(fields[len(dataRow) - 4])
SFARIset = set(SFARIlist)

# For every SFARI symbol, collect the annotation IDs whose text contains it.
SFARIKang = defaultdict(list)
for i in SFARIlist:
    for key, value in geneDict.items():
        # NOTE(review): this is a plain substring test, so a symbol also
        # matches any longer text that merely contains it -- confirm intended.
        if i in value:
            SFARIKang[i].append(key)
print(len(SFARIKang))
# Import Walsh amygdala gene list: the fifth comma-separated field of rows
# 1..340 (line 0 is skipped; the row count is hard-coded for this file).
Walshlist = []
with open("Data/Walsh Proband Amygdala Genes.csv") as data:
    Walsh = data.read().split("\n")
for i in range(1, 341):
    dataRow = Walsh[i].split('\t')
    Walshlist.append(dataRow[0].split(',')[4])
Walshset = set(Walshlist)

# For every Walsh symbol, collect the annotation IDs whose text contains it.
WalshKang = defaultdict(list)
# dict.fromkeys() visits each symbol once, in first-seen order, so a symbol
# that is repeated in the list neither re-scans the whole annotation table nor
# records the same IDs a second time.
for i in dict.fromkeys(Walshlist):
    for key, value in geneDict.items():
        # NOTE(review): plain substring test, as for the SFARI list above in
        # this script -- a symbol also matches longer text that contains it.
        if i in value:
            WalshKang[i].append(key)
print(len(WalshKang))
# Combine both symbol -> ID-list mappings into one plain dict.  A symbol found
# in both keeps its position from SFARIKang and takes the WalshKang value.
SFARIWalshKang = dict(SFARIKang)
SFARIWalshKang.update(WalshKang)
print(len(SFARIWalshKang))
# Make matrix of samples with relevant information.
def _metadata_row(line_index, prefix=None):
    """Return one line of geneData as a list of tab-separated, unquoted cells.

    When *prefix* is given, every occurrence of it is removed from the line
    before splitting.
    """
    text = geneData[line_index].replace('"', '')
    if prefix is not None:
        text = text.replace(prefix, '')
    return text.split('\t')


# Each attribute lives on a fixed line of the series matrix file.
sample_titles = _metadata_row(67)
sample_geo_accessions = _metadata_row(61)
sample_organism = _metadata_row(68)
sample_braincode = _metadata_row(69, 'brain code: ')
sample_region = _metadata_row(70, 'region: ')
sample_hemisphere = _metadata_row(71, 'hemisphere: ')
sample_sex = _metadata_row(72, 'Sex: ')
sample_age = _metadata_row(73, 'age: ')
sample_stage = _metadata_row(74, 'Stage: ')
sample_postmortem_interval = _metadata_row(75, 'postmortem interval: ')
sample_ph = _metadata_row(76, 'ph: ')
sample_rna_integrity = _metadata_row(77, 'rna integrity number: ')

# One row per tab-separated position, one column per attribute, in this order.
samples = np.column_stack((
    sample_titles,
    sample_geo_accessions,
    sample_organism,
    sample_braincode,
    sample_region,
    sample_hemisphere,
    sample_sex,
    sample_age,
    sample_stage,
    sample_postmortem_interval,
    sample_ph,
    sample_rna_integrity,
))
# Compute, for every expression row, the mean amygdala ("AMY") level in each of
# periods 2-8, their average, and the ratio of that average to the row's mean
# over all samples.  Rows with at least one AMY sample >= 6.0 in those periods
# are collected in ``rows``.
results = []
rows = []
# NOTE: this rebinds the name ``csv`` from the imported module to the output
# file handle; the per-gene lines are written through this handle later on.
csv = open('Gene List.csv', "w")
headers = "Gene Name, SFARI Rank [S-3], Period 2, Period 3, Period 4, Period 5, Period 6, Period 7, Period 8, Average Expression, Ratio, Walsh \n"
csv.write(headers)

# Same IDs as ``results`` but as a set, so the "was this row expressed?" check
# below does not rescan an ever-growing list for each row.
expressed_ids = set()
# The expression rows occupy fixed lines of the series matrix file.
for i in range(107, 17672):
    dataRow = geneData[i].replace('"', '').split('\t')
    total = []
    # Levels of the AMY samples, keyed by period (sample column 8).
    stage_levels = {stage: [] for stage in range(2, 9)}
    # Column x of the data row lines up with row x of ``samples``; column 0 is
    # the row's ID.
    for x in range(1, len(dataRow)):
        level = float(dataRow[x])
        total.append(level)
        if samples[x][4] != "AMY":
            continue
        stage = int(samples[x][8])
        if stage not in stage_levels:
            continue
        if level >= 6.0:
            results.append(dataRow[0])
            expressed_ids.add(dataRow[0])
        # NOTE(review): every AMY sample counts towards its period mean; the
        # 6.0 threshold only decides whether the row is reported.  The nesting
        # of this statement was reconstructed -- confirm it matches the
        # intended analysis.
        stage_levels[stage].append(level)

    # A period without AMY samples contributes 0.0 to the average.
    P2, P3, P4, P5, P6, P7, P8 = (
        np.sum(levels) / len(levels) if levels else 0.0
        for levels in stage_levels.values()
    )
    amyAvg = (P2 + P3 + P4 + P5 + P6 + P7 + P8) / 7.0
    totalAvg = np.sum(total) / len(total)
    Ratio = amyAvg / totalAvg
    if dataRow[0] in expressed_ids:
        rowTemp = [dataRow[0], P2, P3, P4, P5, P6, P7, P8, amyAvg, Ratio]
        rows.append(rowTemp)
print(len(rows))
# Group the collected expression rows under every gene symbol whose ID list
# contains the row's ID (element 0 of the row).
GenesFiltered = defaultdict(list)
for i in rows:
    for key, value in SFARIWalshKang.items():
        if i[0] in value:
            GenesFiltered[key].append(i)

# Write one line per gene, using the first row grouped under it.  ``csv`` is
# the output file handle opened earlier in this script; using it as a context
# manager guarantees it is flushed and closed once the last line is written.
with csv:
    for key, value in GenesFiltered.items():
        # First value recorded for the symbol in the SFARI list, or "No" when
        # the symbol is not in that list.  (Indexing the "No" default with [0],
        # as the old code did, produced just "N".)
        SFARI = SFARIlist[key][0] if key in SFARIlist else "No"
        Walsh = "Yes" if key in Walshlist else "No"
        # Elements 1..9 of the row: the seven period means, their average and
        # the ratio, each rounded to three decimals.
        stats = [str(round(number, 3)) for number in value[0][1:10]]
        row = ",".join([key, SFARI, *stats, Walsh]) + "\n"
        csv.write(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment