Created
May 18, 2017 01:52
-
-
Save adiamb/e2515e4cedb0123b3f91e9b9d3091ae9 to your computer and use it in GitHub Desktop.
imputation_cleaning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import re | |
def cleanimpute(x, y, z, info_min=0.8, certainty_min=0.8):
    """Filter an IMPUTE2 genotype table by imputation quality and write it out.

    x -- info DataFrame with 'rs_id', 'info' and 'certainty' columns.
    y -- genotype DataFrame whose column 1 holds the SNP id.
    z -- output CSV path (written without index or header).
    info_min, certainty_min -- quality thresholds (default 0.8, matching the
        original hard-coded values; now parameters so looser runs, e.g. the
        0.3-info variant later in this file, can reuse one function).
    """
    keep = x[(x['info'] >= info_min) & (x['certainty'] >= certainty_min)]
    # keep only genotype rows whose SNP id survived the quality filter
    y[y[1].isin(keep.rs_id)].to_csv(z, index=False, header=False)
import re

# SNP ids containing a literal '.' (non-rsID names such as "1.12345") are skipped
p = re.compile(r'[.]')

def todos(x, y):
    """Convert a comma-separated IMPUTE2 probability file to a dosage file.

    x -- open input file; each line is CHR,SNP,POS,A1,A2 followed by
         genotype-probability triples (p(AA), p(AB), p(BB)) per sample.
    y -- open output file; receives space-separated lines
         "CHR SNP POS A1 A2 dose1 dose2 ..." where dose = 0*p(AA)+1*p(AB)+2*p(BB).

    SNPs whose id contains '.' are dropped.  Both handles are closed on return.
    """
    n = 0
    for line in x:
        fields = line.replace("\n", "").split(",")
        # '== 0' replaces the original 'is 0' identity test, which relies on
        # CPython small-int caching and is not a reliable equality check
        if len(p.findall(fields[1])) == 0:
            n += 1
            # renamed from 'chr', which shadowed the builtin
            chrom, snp, pos, a1, a2 = fields[0], fields[1], fields[2], fields[3], fields[4]
            out = [chrom, snp, pos, a1, a2]
            probs = np.array(fields[5:], dtype=float)
            for i in range(0, len(probs), 3):
                triple = probs[i:i + 3]
                dose = round(triple[0] * 0 + triple[1] * 1 + triple[2] * 2, 2)
                out.append(str(dose))
            y.write(" ".join(out) + "\n")
    # single-argument print with %-formatting works identically in py2 and py3
    print("##################################processed %s SNPS from %s dosages written to %s" % (n, x, y))
    y.close()
    x.close()
###################### impute2 cleaning: pEUR plates ###################
for s in range(1, 23):
    info_df = pd.read_table('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr%d.plate_EUR.info' % s,
                            delim_whitespace=True, header=0)
    impute_df = pd.read_table('/media/labcomp/HDD3/GWAS/CHR%d_pEUR.impute2' % s,
                              delim_whitespace=True, header=None)
    # filter by info/certainty and write the cleaned impute2 file
    cleanimpute(info_df, impute_df, '/media/labcomp/HDD3/GWAS/fil_April22_pEUR_CHR%d.impute2' % s)
########################### convert to dosage #############################
for s in range(1, 23):
    src = open('/media/labcomp/HDD3/GWAS/fil_April22_pEUR_CHR%d.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/pEUR_April22_CHR%d.dos' % s, 'w')
    todos(src, dst)  # todos closes both handles
################################################
# p35_36_55 plates: clean, then convert to dosage
################################################
for s in range(1, 23):
    info_df = pd.read_table('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr%d.Plate35_36_55.info' % s,
                            delim_whitespace=True, header=0)
    impute_df = pd.read_table('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/CHR%d_P35_36_55.impute2' % s,
                              delim_whitespace=True, header=None)
    impute_df[0] = s  # stamp the chromosome number into column 0
    cleanimpute(info_df, impute_df, '/media/labcomp/HDD3/GWAS/fil_April22_p35_36_55_CHR%d.impute2' % s)
import re
p = re.compile(r'[.]')  # recompile of the module-level pattern used by todos (harmless no-op)
for s in range(1, 23):
    src = open('/media/labcomp/HDD3/GWAS/fil_April22_p35_36_55_CHR%d.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/p35_36_55_April22_CHR%d.dos' % s, 'w')
    n = 0  # vestigial counter; todos keeps its own
    todos(src, dst)
##########################################
# pEAS plates: clean, then convert to dosage
for s in range(1, 23):
    info_df = pd.read_table('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr%d.plate_EAS.info' % s,
                            delim_whitespace=True, header=0)
    impute_df = pd.read_table('/media/labcomp/HDD3/GWAS/EAS_clean_CHR%d.impute2' % s,
                              delim_whitespace=True, header=None)
    cleanimpute(info_df, impute_df, '/media/labcomp/HDD3/GWAS/fil_April22_pEAS_CHR%d.impute2' % s)
import re
p = re.compile(r'[.]')  # recompile of the pattern used by todos (harmless no-op)
for s in range(1, 23):
    src = open('/media/labcomp/HDD3/GWAS/fil_April22_pEAS_CHR%d.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/pEAS_April22_CHR%d.dos' % s, 'w')
    n = 0  # vestigial counter; todos keeps its own
    todos(src, dst)
##########################################
# p86to87 plates: clean, then convert to dosage
for s in range(1, 23):
    info_df = pd.read_table('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr%d.p86to87QC.info' % s,
                            delim_whitespace=True, header=0)
    impute_df = pd.read_table('/media/labcomp/HDD3/GWAS/plate86to87_CHR%d.impute2' % s,
                              delim_whitespace=True, header=None)
    cleanimpute(info_df, impute_df, '/media/labcomp/HDD3/GWAS/fil_April22_p86to87_CHR%d.impute2' % s)
import re
p = re.compile(r'[.]')  # recompile of the pattern used by todos (harmless no-op)
for s in range(1, 23):
    src = open('/media/labcomp/HDD3/GWAS/fil_April22_p86to87_CHR%d.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/p86to87_April22_CHR%d.dos' % s, 'w')
    n = 0  # vestigial counter; todos keeps its own
    todos(src, dst)
########################################### Q17 / Q35 dosage conversion
# input template: /media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR<s>_Q35.impute2
import re
p = re.compile(r'[.]')  # pattern used by todos to skip non-rsID SNP names
# Q35 Japanese cohort: impute2 -> dosage
for s in range(1, 23):
    src = open('/media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR%d_Q35.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/Q35_CHR%d.dos' % s, 'w')
    n = 0  # vestigial counter; todos keeps its own
    todos(src, dst)
import re
p = re.compile(r'[.]')
# Q17 Japanese cohort: impute2 -> dosage
for s in range(1, 23):
    src = open('/media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR%d_Q17.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/Q17_CHR%d.dos' % s, 'w')
    n = 0
    todos(src, dst)
############################# write annon functions to process impute2 files ########################## | |
### old function -see above | |
def cleanimpute(x, y, z):
    """Quality-filter an IMPUTE2 table and write it as a header-less CSV.

    NOTE: redefines the cleanimpute above with a looser info cutoff
    (0.3 instead of 0.8); certainty cutoff stays at 0.8.
    """
    mask = (x['info'] >= 0.3) & (x['certainty'] >= 0.8)
    good_ids = x[mask].rs_id
    filtered = y[y[1].isin(good_ids)]
    filtered.to_csv(z, index=False, header=None)
## scratch run of the impute2-to-dosage conversion on a single test file
x = open('/media/labcomp/HDD3/GWAS/test.impute2', 'r')
y = open('/media/labcomp/HDD3/GWAS/test.dos', 'w')
import re

# matches a literal '.' inside a SNP id (non-rsID names are skipped)
p = re.compile(r'[.]')

def todos(x, y):
    """Convert a comma-separated IMPUTE2 probability file to a dosage file.

    Duplicate of the todos defined earlier in this file (kept because the
    script was pasted from an interactive session).  For each retained SNP,
    dose = 0*p(AA) + 1*p(AB) + 2*p(BB) per sample, rounded to 2 decimals.
    Closes both handles on return.
    """
    n = 0
    for line in x:
        fields = line.replace("\n", "").split(",")
        # '== 0' replaces the original fragile 'is 0' identity comparison
        if len(p.findall(fields[1])) == 0:
            n += 1
            chrom, snp, pos, a1, a2 = fields[0], fields[1], fields[2], fields[3], fields[4]
            out = [chrom, snp, pos, a1, a2]
            probs = np.array(fields[5:], dtype=float)
            for i in range(0, len(probs), 3):
                triple = probs[i:i + 3]
                dose = round(triple[0] * 0 + triple[1] * 1 + triple[2] * 2, 2)
                out.append(str(dose))
            y.write(" ".join(out) + "\n")
    # single-argument %-formatted print runs identically under py2 and py3
    print("##################################processed %s SNPS from %s dosages written to %s" % (n, x, y))
    y.close()
    x.close()
import re
p = re.compile(r'[.]')  # pattern consumed by todos
# earlier p86to87 run: convert the pre-filtered impute2 files to dosages
for s in range(1, 23):
    src = open('/media/labcomp/HDD3/GWAS/filter_CHR%d_p86to87.impute2' % s, 'r')
    dst = open('/media/labcomp/HDD3/GWAS/p86to87_CHR%d.dos' % s, 'w')
    todos(src, dst)  # todos closes both handles
################################################# add sample IDs to genotype files ################
# one-off: build the p86to87 header line (fixed columns + sample ids, column 2
# of the .sample file) and write it without a trailing newline
pref_cols = ['CHR', 'SNP', 'POS', 'A1', 'A2']
for rows in open('/media/labcomp/HDD3/GWAS/p86to87_order.sample', 'r'):
    pref_cols.append(rows.replace('\n', '').split('\t')[1])
out = open('/media/labcomp/HDD3/GWAS/test86to87header', 'w')
out.write(" ".join(pref_cols))
out.close()
######## header-writer used over all cohorts below
def addcols(x, y):
    """Write a .dos header line: fixed columns followed by the sample IDs.

    x -- open sample file, tab separated, with the sample ID in column 2.
    y -- open output file; receives one space-joined header line.
    Both handles are closed on return.
    """
    header = ['CHR', 'SNP', 'POS', 'A1', 'A2']
    for row in x:
        header.append(row.replace('\n', '').split('\t')[1])
    y.write(" ".join(header) + "\n")
    y.close()
    x.close()
######## apply addcols to every sample file listed in filelist_April22
files = open('/media/labcomp/HDD3/GWAS/filelist_April22', 'r')
for entry in files:
    plateID = entry.replace('\n', '')
    plateID1 = plateID.split('_')[0]  # cohort prefix, e.g. 'pEUR'
    print('%s %s' % (plateID1, plateID))
    sample_fh = open('/media/labcomp/HDD3/GWAS/' + str(plateID), 'r')
    header_fh = open('/media/labcomp/HDD3/GWAS/' + str(plateID1) + "_colnames", 'w')
    addcols(sample_fh, header_fh)  # addcols closes both handles
################ add sample names in bash parallel ############################## EMonly
# GNU parallel shell commands (NOT Python): prepend each cohort's column-name
# header file (<cohort>_colnames) to its per-chromosome dosage file; the
# quoted '>' defers redirection to the worker shell spawned by parallel.
parallel --jobs 8 cat pEUR_colnames pEUR_April22_CHR{}.dos '>' final_dos/pEUR_April22_CHR{}.dos ::: {1..22}
parallel --jobs 8 cat p86to87_colnames p86to87_April22_CHR{}.dos '>' final_dos/p86to87_April22_CHR{}.dos ::: {1..22}
parallel --jobs 8 cat p35_colnames p35_36_55_April22_CHR{}.dos '>' final_dos/p35_36_55_April22_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p37_colnames p37_CHR{}.dos '>' final_dos/p37_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p38_colnames p37_CHR{}.dos '>' final_dos/p38_CHR{}.dos ::: {1..22}
parallel --jobs 8 cat pEAS_colnames pEAS_April22_CHR{}.dos '>' final_dos/pEAS_April22_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p77to85_colnames p77to85_CHR{}.dos '>' final_dos/p77to85_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat Q17_colnames Q17_CHR{}.dos '>' final_dos/Q17_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat Q35_colnames Q35_CHR{}.dos '>' final_dos/Q35_CHR{}.dos ::: {1..22}
################## join all cohorts: one merged .dos for chromosome 1 ##################
import pandas as pd
from pandas import Series, DataFrame

# per-cohort CHR1 dosage file paths; list order matters for the indexing below
filelist = []
for rows in open('/media/labcomp/HDD3/GWAS/final_dos/filelist_CHR1', 'r'):
    filelist.append(rows.replace('\n', ''))
colstodrop = ['CHR', 'POS', 'A1', 'A2']
pEUR_CHR1 = pd.read_table(filelist[-3], delim_whitespace=True, header=0, index_col='SNP', low_memory=True)
pEUR_CHR1.drop(colstodrop, axis=1, inplace=True)
p35_36_55_CHR1 = pd.read_table(filelist[0], delim_whitespace=True, header=0, index_col='SNP')
p35_36_55_CHR1.drop(colstodrop, axis=1, inplace=True)
Q17_CHR1 = pd.read_table(filelist[-2], delim_whitespace=True, header=0, index_col='SNP')
Q17_CHR1.drop(colstodrop, axis=1, inplace=True)
Q35_CHR1 = pd.read_table(filelist[-1], delim_whitespace=True, header=0, index_col='SNP')
Q35_CHR1.drop(colstodrop, axis=1, inplace=True)
p86to87_CHR1 = pd.read_table(filelist[4], delim_whitespace=True, header=0, index_col='SNP')
for other in (pEUR_CHR1, p35_36_55_CHR1, Q17_CHR1, Q35_CHR1):
    p86to87_CHR1 = p86to87_CHR1.join(other, how='outer')
# BUG FIX: the original re-ran this drop inside a loop over filelist (which
# also computed an unused 'fileid'); the second inplace drop of A1/A2 raises
# KeyError.  Drop once, then write.
p86to87_CHR1.drop(['A1', 'A2'], axis=1, inplace=True)
p86to87_CHR1.to_csv('/media/labcomp/HDD3/GWAS/final_dos/CHR1_merged.dos', index=True, header=True)
import pandas as pd
import numpy as np
# merge the six cohorts per chromosome.  NOTE: dosage_merge is defined *below*
# in this file -- this loop only ran in an interactive session where the
# function already existed.
for s in range(1, 23):
    a = pd.read_csv('/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_CHR%d.dos' % s,
                    header=0, index_col='SNP', low_memory=True)
    a['CHR'] = s  # stamp the chromosome number
    frames = {}
    for tag in ('pEUR', 'p35_36_55', 'Q17', 'Q35', 'pEAS'):
        frames[tag] = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/%s_CHR%d.dos' % (tag, s),
                                    delim_whitespace=True, header=0, index_col='SNP', low_memory=True)
    filename = '/media/labcomp/HDD3/GWAS/final_dos/merged_dupsrm_CHR%d.dos' % s
    dosage_merge(a, frames['pEUR'], frames['p35_36_55'], frames['Q17'], frames['Q35'], frames['pEAS'], filename)
    print('done with chr %s written to %s' % (s, filename))
def dosage_merge(a, b, c, d, e, f, filename):
    """Outer-join five cohort dosage frames onto `a` (all indexed by SNP) and write CSV.

    a -- base frame (p86to87) keeping its CHR/POS columns.
    b..f -- pEUR, p35_36_55, Q17, Q35, pEAS frames; their CHR/POS/A1/A2
            columns and fully duplicated rows are dropped first (in place,
            so the caller's frames are mutated).
    filename -- output CSV path.
    """
    colstodrop = ['CHR', 'POS', 'A1', 'A2']
    # the original repeated these two statements for each of b..f verbatim
    for frame in (b, c, d, e, f):
        frame.drop(colstodrop, axis=1, inplace=True)
        frame.drop_duplicates(keep='last', inplace=True)
    for frame in (b, c, d, e, f):  # pEUR, p35_36_55, Q17, Q35, pEAS
        a = a.join(frame, how='outer')
    a.drop(['A1', 'A2'], axis=1, inplace=True)
    a.to_csv(filename, index=True, header=True)
def dosage_merge(a, b, c, d, e, f, filename):
    """Variant of dosage_merge (redefines the one above): no full-row dedup,
    only f (pEAS) is de-duplicated, by SNP index.

    Mutates b..f in place by dropping their CHR/POS/A1/A2 columns.
    """
    colstodrop = ['CHR', 'POS', 'A1', 'A2']
    for frame in (b, c, d, e, f):
        frame.drop(colstodrop, axis=1, inplace=True)
    # BUG FIX: the original did f = f.index.drop_duplicates(keep='last'),
    # which replaced the DataFrame with a bare Index and broke the join
    # below; keep the last row for each duplicated SNP instead.
    f = f[~f.index.duplicated(keep='last')]
    a = a.join(b, how='outer')  # pEUR
    a = a.join(c, how='outer')  # p35_36_55
    a = a.join(d, how='outer')  # Q17
    a = a.join(e, how='outer')  # Q35
    a = a.join(f, how='outer')  # pEAS
    a.drop(['A1', 'A2'], axis=1, inplace=True)
    a.to_csv(filename, index=True, header=True)
################ dos cleaning of p86to87 (chr1 exploration) ################
chr1 = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p86to87_CHR1.dos', header=0, delim_whitespace=True)
# rows with a missing allele call on either side
A1 = chr1[chr1['A1'] == '-']
A2 = chr1[chr1['A2'] == '-']
A1_A2 = pd.concat([A1, A2])
## get all duplicated SNPs, remove them from chr1, then re-admit the copies
## that have proper (non '-'/'0') allele calls
dupchr1 = chr1[chr1['SNP'].duplicated(keep=False)]
chr1.drop_duplicates(['SNP'], keep=False, inplace=True)
dupfil = dupchr1[(dupchr1.A1 != '-') & (dupchr1.A2 != '-')]
dupfil2 = dupfil[(dupfil.A1 != '0') & (dupfil.A2 != '0')]
total = pd.concat([chr1, dupfil2])
# BUG FIX: the original wrote keep=Last (undefined name, NameError at runtime)
total.drop_duplicates(['SNP'], keep='last', inplace=True)
############## remove dups and clean
import numpy as np
import pandas as pd
def filterdups(a, output):
    """Resolve duplicated SNPs and write the cleaned table as CSV.

    Duplicated SNP rows are removed from `a`, then copies with proper allele
    calls (neither '-' nor '0' on either allele) are re-admitted, keeping the
    last such copy.  NOTE: mutates the caller's DataFrame `a` in place.
    """
    duplicated = a[a['SNP'].duplicated(keep=False)]
    a.drop_duplicates(['SNP'], keep=False, inplace=True)
    usable = duplicated[(duplicated.A1 != '-') & (duplicated.A2 != '-')]
    usable = usable[(usable.A1 != '0') & (usable.A2 != '0')]
    combined = pd.concat([a, usable])
    combined.drop_duplicates(['SNP'], keep='last', inplace=True)
    combined.to_csv(output, index=False, header=True)
# strip duplicate SNPs from every p86to87 chromosome file
for s in range(1, 23):
    frame = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p86to87_April22_CHR%d.dos' % s,
                          delim_whitespace=True, header=0, low_memory=True)
    filterdups(frame, '/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_April22_CHR%d.dos' % s)
# analysis of EM samples only: build per-chromosome dosages for the EM plates
import pandas as pd
import numpy as np
for s in range(1, 23):
    a = pd.read_csv('/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_April22_CHR%d.dos' % s,
                    header=0, index_col='SNP', low_memory=True)
    a['CHR'] = s  # stamp the chromosome number
    b = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEUR_April22_CHR%d.dos' % s,
                      delim_whitespace=True, header=0, index_col='SNP', low_memory=True)
    c = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p35_36_55_April22_CHR%d.dos' % s,
                      delim_whitespace=True, header=0, index_col='SNP', low_memory=True)
    # Q17/Q35 (Japanese cohorts) deliberately excluded from the EM-only merge
    d = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEAS_April22_CHR%d.dos' % s,
                      delim_whitespace=True, header=0, index_col='SNP', low_memory=True)
    filename = '/media/labcomp/HDD3/GWAS/final_dos/EMplates_April22_CHR%d.dos' % s
    dosage_merge(a, b, c, d, filename)  # the 4-argument dosage_merge defined below
    print('done with chr %s written to %s' % (s, filename))
def dosage_merge(a, b, c, d, filename):
    """Inner-join three EM cohort dosage frames onto `a` and write CSV.

    NOTE: redefines the 6-argument dosage_merge above with a 4-argument
    signature (EM-only run: b=pEUR, c=p35_36_55, d=pEAS).  Mutates b..d in
    place by dropping their CHR/POS/A1/A2 columns and duplicated rows.
    """
    colstodrop = ['CHR', 'POS', 'A1', 'A2']
    # the original repeated these two statements verbatim for each frame
    for frame in (b, c, d):
        frame.drop(colstodrop, axis=1, inplace=True)
        frame.drop_duplicates(keep='last', inplace=True)
    for frame in (b, c, d):  # pEUR, p35_36_55, pEAS
        a = a.join(frame, how='inner')
    a.drop(['A1', 'A2'], axis=1, inplace=True)
    a.to_csv(filename, index=True, header=True)
########################################################################### concat all chrs to make one big dosage file ################################################
import pandas as pd

doslist = open('EM_April23_doslist', 'r')
dosages = []
for i in doslist:
    dos = i.replace('\n', '')
    dosages.append(pd.read_csv(dos, index_col='SNP', header=0, low_memory=True, verbose=True))
total_dos = pd.concat(dosages)
### calculate minor-allele frequency for each SNP
import numpy as np
# BUG FIX: DataFrame.ix was removed from pandas; .iloc gives the same
# positional slice.  Also vectorized over rows instead of a Python loop.
# Dosage columns start at position 2; mean dosage / 2 is the allele frequency.
mean_freq = total_dos.iloc[:, 2:].mean(axis=1) / 2
maf = np.minimum(mean_freq, 1 - mean_freq).tolist()
################################################################### association testing for QTLs ###################################################################
# read the column names of 'full' from its separate header file
header_cols = None
a = open('header_full', 'r')
for line in a:
    print(line)
    header_cols = line.replace('\r', '').replace('\n', '').split('\t')
# read the data rows of 'full', padding empty cells with 'NA'
b = open('full', 'r')
out2 = []
n = 0
for row in b:
    n += 1
    if n > 1:  # skip the header row
        fields = row.replace('\n', '').replace('\r', '').split('\t')
        # '== 0' replaces the fragile 'is 0' identity test of the original
        out2.append([cell if len(cell) != 0 else 'NA' for cell in fields])
final_out = pd.DataFrame(out2)
# BUG FIX: the original set final_out.columns = line, but 'line' had been
# clobbered by the data loop (it held the last raw data row, a plain string);
# use the header parsed above instead
final_out.columns = header_cols
fullassoc = pd.read_csv('full', header=0, usecols=header_cols)
### read in the association files (EM, EM_CSF, EM_SERUM, JP, EM_CSF_SERUM ratio)
### and merge each with the disease-trait results in final_out
EM_CSF = pd.read_csv('TOTAL_EMonly_QTLs.csv', header=0, delim_whitespace=True)
EM_CSF_merge = pd.merge(EM_CSF, final_out, left_on='snps', right_on='SNPS',
                        how='inner').sort_values(by='pvalue')
EM_CSF_merge.to_csv('EM_CSF_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
EM_SERUM = pd.read_csv('TOTAL_EMonly_Serum_QTLs.csv', header=0)
EM_SERUM_merge = pd.merge(EM_SERUM, final_out, left_on='snps', right_on='SNPS',
                          how='inner').sort_values(by='pvalue')
EM_SERUM_merge.to_csv('EM_SERUM_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
EM_ratios = pd.read_csv('TOTAL_ratio_QTLs.csv', header=0)
EM_ratios_merge = pd.merge(EM_ratios, final_out, left_on='snps', right_on='SNPS',
                           how='inner').sort_values(by='pvalue')
EM_ratios_merge.to_csv('EM_Ratios_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
##########################################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pandas import DataFrame
# BUG FIX: read_csv's second positional argument is the separator, so the
# original pd.read_csv('EM_CSF_QTLs_Apr3.csv', 'r') parsed with sep='r'
# ('r' was copied from an open() call); use the default comma separator.
EM_CSF = pd.read_csv('EM_CSF_QTLs_Apr3.csv')
fields = []
x = open('Gene_Clusters.txt', 'r')
for line in x:
    # tabs -> ';', then split off the trailing newline (yields [text, ''])
    line3 = line.replace('\t', ';').split('\n')
    print(line3)
    fields.append(line3)
for f in fields:
    f2 = "".join(f)
    print(f2)
########################### get all hard genotypes ###########################
import os
import pandas as pd

# BUG FIX: the original had a bare IPython `cd` magic here, which is a
# SyntaxError in plain Python; os.chdir does the same thing.
os.chdir('/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/')
bimlist = ['HCRT_batch2_AA_3.bim', 'Plate35_36_55.bim', 'EAS_plate.bim', 'EUR_plate.bim']
path = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/'
bims = []
for i in bimlist:
    temp = pd.read_csv(path + i, delim_whitespace=True, header=None)
    temp['plateID'] = i  # tag every row with its source plate file
    bims.append(temp)
bims[0].plateID = 'Plate86to87'  # friendlier label for the HCRT batch
EM_genotypedsnps = pd.concat(bims)
EM_genotypedsnps = EM_genotypedsnps.rename(columns={0: 'CHR', 1: 'RSID', 3: 'POS', 4: 'A1', 5: 'A2'})
# plate key:
#   p86to87   = HCRT_batch2_AA_3.bim
#   pEUR      = EUR_plate
#   p77to85QC = /media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/Plate77to85QC_F.bim (unused here)
#   p35_55_36 = Plate35_36_55
#   pEAS      = EAS_plate.bim
## R snippet: summarize genotyped SNPs per plate.
## BUG FIX: fread() lives in data.table, not reshape2 -- the original never
## loaded it, so fread was undefined.
require(data.table)
require(reshape2)
em_genos = fread('~/Desktop/EM_genotyped_snps.csv', header = T)
# wide count table: one column per plateID, counting occurrences of each SNP
agg_em = dcast(data = em_genos, formula = CHR+RSID+POS+A1+A2 ~ plateID, fun.aggregate = length)
agg_em2 = dcast(data = em_genos, formula = CHR+RSID+POS ~ plateID, fun.aggregate = length)
write.csv(agg_em2, file = '~/Desktop/EM_genotypes_summarized_April22.csv', row.names = F)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment