Skip to content

Instantly share code, notes, and snippets.

@adiamb
Created May 18, 2017 01:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adiamb/e2515e4cedb0123b3f91e9b9d3091ae9 to your computer and use it in GitHub Desktop.
Save adiamb/e2515e4cedb0123b3f91e9b9d3091ae9 to your computer and use it in GitHub Desktop.
imputation_cleaning
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
def cleanimpute(x, y, z, info_min=0.8, certainty_min=0.8):
    """Filter an impute2 genotype table by imputation quality and write it out.

    x -- info DataFrame with 'info', 'certainty' and 'rs_id' columns.
    y -- impute2 genotype DataFrame read with header=None (column 1 = rs id).
    z -- output path (or buffer); filtered rows are written as headerless CSV.
    info_min / certainty_min -- quality thresholds; a SNP must meet BOTH
        (defaults preserve the original hard-coded 0.8 / 0.8 cutoffs).
    """
    # Keep only well-imputed SNPs, then subset the genotype table to them.
    cleaned_info = x[(x['info'] >= info_min) & (x['certainty'] >= certainty_min)]
    cleaned_impute = y[y[1].isin(cleaned_info.rs_id)]
    # header=False (was header=None) -- to_csv documents a bool here.
    cleaned_impute.to_csv(z, index=False, header=False)
import re

# SNPs whose rs id contains a '.' (unnamed/indel-style ids) are skipped below.
p = re.compile(r'[.]')


def todos(x, y):
    """Convert a comma-separated impute2 file to space-separated dosages.

    x -- open input handle; each line is
         chr,rsid,pos,A1,A2,p(AA),p(AB),p(BB),...  (3 probabilities/sample).
    y -- open output handle; receives 'chr rsid pos A1 A2 dos1 dos2 ...'.

    SNPs whose rs id contains a '.' are dropped.  Both handles are closed
    on return.
    """
    n = 0
    for line in x:
        fields = line.replace("\n", "").split(",")
        # BUG FIX: was 'is 0' (identity test on an int); use equality.
        if len(fields[1].split('.')) - 1 == len(p.findall(fields[1])) and len(p.findall(fields[1])) == 0:
            n += 1
            # 'chrom' rather than 'chr' -- avoid shadowing the builtin.
            chrom, snp, pos, A1, A2 = fields[0], fields[1], fields[2], fields[3], fields[4]
            out = [chrom, snp, pos, A1, A2]
            fields1 = np.array(fields[5:], dtype=float)
            # Expected dosage = 0*P(AA) + 1*P(AB) + 2*P(BB) per sample triple.
            for i in range(0, len(fields1), 3):
                array1 = fields1[i:i + 3]
                array2 = round(array1[0] * 0 + array1[1] * 1 + array1[2] * 2, 2)
                out.append(str(array2))
            y.write(" ".join(out) + "\n")
    # Single formatted string so this prints identically on Python 2 and 3.
    print("##################################processed {0} SNPS from {1} dosages written to {2}".format(n, x, y))
    y.close()
    x.close()
######################impute2 cleaning ###################################
# pEUR plates: quality-filter each chromosome's impute2 file.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/CHR' + str(s) + "_pEUR.impute2"
    pathtoinfo = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr' + str(s) + ".plate_EUR.info"
    z = "/media/labcomp/HDD3/GWAS/fil_April22_pEUR_CHR" + str(s) + ".impute2"
    x = pd.read_table(pathtoinfo, delim_whitespace=True, header=0)
    y = pd.read_table(pathtoimpute, delim_whitespace=True, header=None)
    cleanimpute(x, y, z)
###########################convert to dosage#############################
# ...then convert each filtered file to dosage format.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/fil_April22_pEUR_CHR' + str(s) + '.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/pEUR_April22_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    todos(x, y)
################################################
################################################
# p35_36_55 plates: quality-filter, force the chromosome column, then
# convert to dosages.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/CHR' + str(s) + "_P35_36_55.impute2"
    pathtoinfo = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr' + str(s) + ".Plate35_36_55.info"
    x = pd.read_table(pathtoinfo, delim_whitespace=True, header=0)
    y = pd.read_table(pathtoimpute, delim_whitespace=True, header=None)
    y[0] = s  # overwrite column 0 with the chromosome number
    z = "/media/labcomp/HDD3/GWAS/fil_April22_p35_36_55_CHR" + str(s) + ".impute2"
    cleanimpute(x, y, z)
import re
p = re.compile(r'[.]')
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/fil_April22_p35_36_55_CHR' + str(s) + '.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/p35_36_55_April22_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    n = 0  # unused; todos keeps its own counter
    todos(x, y)
##########################################
# pEAS plates: quality-filter each chromosome, then convert to dosages.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/EAS_clean_CHR' + str(s) + ".impute2"
    pathtoinfo = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr' + str(s) + ".plate_EAS.info"
    x = pd.read_table(pathtoinfo, delim_whitespace=True, header=0)
    y = pd.read_table(pathtoimpute, delim_whitespace=True, header=None)
    z = "/media/labcomp/HDD3/GWAS/fil_April22_pEAS_CHR" + str(s) + ".impute2"
    cleanimpute(x, y, z)
import re
p = re.compile(r'[.]')
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/fil_April22_pEAS_CHR' + str(s) + '.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/pEAS_April22_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    n = 0  # unused; todos keeps its own counter
    todos(x, y)
##########################################
# p86to87 plates: quality-filter each chromosome, then convert to dosages.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/plate86to87_CHR' + str(s) + ".impute2"
    pathtoinfo = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/imputeextras/chr' + str(s) + ".p86to87QC.info"
    x = pd.read_table(pathtoinfo, delim_whitespace=True, header=0)
    y = pd.read_table(pathtoimpute, delim_whitespace=True, header=None)
    z = "/media/labcomp/HDD3/GWAS/fil_April22_p86to87_CHR" + str(s) + ".impute2"
    cleanimpute(x, y, z)
import re
p = re.compile(r'[.]')
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/fil_April22_p86to87_CHR' + str(s) + '.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/p86to87_April22_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    n = 0  # unused; todos keeps its own counter
    todos(x, y)
###########################################Q17
# BUG FIX: an unquoted path fragment sat on the next line in the original
# and was a syntax error; preserved as a comment only:
# /media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR" + str(s) + "_Q35.impute2
import re
p = re.compile(r'[.]')
# Q35 plates: convert the already-cleaned impute2 files to dosages.
# (Unused 'n = 0' assignments removed; todos tracks its own count.)
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR' + str(s) + '_Q35.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/Q35_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    todos(x, y)
import re
p = re.compile(r'[.]')
# Q17 plates.
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD/GWASProjects/japanesedata_GWAS/clean_mar10_CHR' + str(s) + '_Q17.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/Q17_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    todos(x, y)
############################# write annon functions to process impute2 files ##########################
### old function -see above
def cleanimpute(x, y, z):
    """Redefinition of cleanimpute with a looser info threshold (0.3).

    Keeps rows of the impute2 table *y* whose rs id (column 1) passes
    info >= 0.3 and certainty >= 0.8 in the info table *x*, and writes
    them to *z* as a headerless CSV.
    """
    passing = (x['info'] >= 0.3) & (x['certainty'] >= 0.8)
    good_ids = x[passing].rs_id
    y[y[1].isin(good_ids)].to_csv(z, index=False, header=None)
##function to perform impute2 to dosage conversion
# NOTE(review): these handles on test.impute2/test.dos are opened here but
# never passed to todos before being re-bound by the loop further below, so
# they appear to leak -- confirm against the original (interactive) session.
x = open('/media/labcomp/HDD3/GWAS/test.impute2', 'r')
y= open('/media/labcomp/HDD3/GWAS/test.dos', 'w')
import re

# SNPs whose rs id contains a '.' are skipped by todos below.
p = re.compile(r'[.]')


def todos(x, y):
    """Convert a comma-separated impute2 file to space-separated dosages.

    (Redefinition of the todos above; behavior is identical.)

    x -- open input handle; each line is
         chr,rsid,pos,A1,A2,p(AA),p(AB),p(BB),...  (3 probabilities/sample).
    y -- open output handle; receives 'chr rsid pos A1 A2 dos1 dos2 ...'.
    Both handles are closed on return.
    """
    n = 0
    for line in x:
        fields = line.replace("\n", "").split(",")
        # BUG FIX: was 'is 0' (identity test on an int); use equality.
        if len(p.findall(fields[1])) == 0:
            n += 1
            # 'chrom' rather than 'chr' -- avoid shadowing the builtin.
            chrom, snp, pos, A1, A2 = fields[0], fields[1], fields[2], fields[3], fields[4]
            out = [chrom, snp, pos, A1, A2]
            fields1 = np.array(fields[5:], dtype=float)
            # Expected dosage = 0*P(AA) + 1*P(AB) + 2*P(BB) per sample triple.
            for i in range(0, len(fields1), 3):
                array1 = fields1[i:i + 3]
                array2 = round(array1[0] * 0 + array1[1] * 1 + array1[2] * 2, 2)
                out.append(str(array2))
            y.write(" ".join(out) + "\n")
    # Single formatted string so this prints identically on Python 2 and 3.
    print("##################################processed {0} SNPS from {1} dosages written to {2}".format(n, x, y))
    y.close()
    x.close()
# Convert the filtered p86to87 impute2 files to dosages.
import re
p = re.compile(r'[.]')
for s in range(1, 23):
    pathtoimpute = '/media/labcomp/HDD3/GWAS/filter_CHR' + str(s) + '_p86to87.impute2'
    pathtoout = '/media/labcomp/HDD3/GWAS/p86to87_CHR' + str(s) + '.dos'
    x = open(pathtoimpute, 'r')
    y = open(pathtoout, 'w')
    todos(x, y)
################################################# add sample IDS to genotype files ################
# Build the .dos header line: fixed annotation columns followed by the
# sample IDs (second tab-separated field of each .sample row).
pref_cols = ['CHR', 'SNP', 'POS', 'A1', 'A2']
for rows in open('/media/labcomp/HDD3/GWAS/p86to87_order.sample', 'r'):
    line = rows.replace('\n', '')
    line1 = line.split('\t')[1]
    pref_cols.append(line1)
out = open('/media/labcomp/HDD3/GWAS/test86to87header', 'w')
out.write(" ".join(pref_cols))
out.close()
########write an annon function to implement
def addcols(x, y):
    """Write a .dos header line built from a .sample file.

    x -- open .sample file handle; the sample ID is the 2nd tab-separated
         field of each row.
    y -- open output handle; receives one space-separated line:
         'CHR SNP POS A1 A2 <id1> <id2> ...'.
    Both handles are closed on return.
    """
    header = ['CHR', 'SNP', 'POS', 'A1', 'A2']
    for row in x:
        header.append(row.replace('\n', '').split('\t')[1])
    y.write(" ".join(header) + "\n")
    y.close()
    x.close()
########apply this function over a loop
# For every sample file listed in filelist_April22, emit a matching
# '<plate>_colnames' header file via addcols().
files = open('/media/labcomp/HDD3/GWAS/filelist_April22', 'r')
for i in files:
    plateID = i.replace('\n', '')
    plateID1 = plateID.split('_')[0]  # plate prefix before the first '_'
    print plateID1, plateID
    pathtosample = '/media/labcomp/HDD3/GWAS/' + str(plateID)
    pathtoout = '/media/labcomp/HDD3/GWAS/' + str(plateID1) + "_colnames"
    x= open(pathtosample, 'r')
    y=open(pathtoout, 'w')
    addcols(x, y)
################add sample names in bash paralell ############################## EMonly
# NOTE(review): the lines below are GNU parallel / bash commands, not Python.
# They were pasted in verbatim (a syntax error in this file); kept commented
# out as a record of how headers were prepended to the .dos files.
# parallel --jobs 8 cat pEUR_colnames pEUR_April22_CHR{}.dos '>' final_dos/pEUR_April22_CHR{}.dos ::: {1..22}
# parallel --jobs 8 cat p86to87_colnames p86to87_April22_CHR{}.dos '>' final_dos/p86to87_April22_CHR{}.dos ::: {1..22}
# parallel --jobs 8 cat p35_colnames p35_36_55_April22_CHR{}.dos '>' final_dos/p35_36_55_April22_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p37_colnames p37_CHR{}.dos '>' final_dos/p37_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p38_colnames p37_CHR{}.dos '>' final_dos/p38_CHR{}.dos ::: {1..22}
# parallel --jobs 8 cat pEAS_colnames pEAS_April22_CHR{}.dos '>' final_dos/pEAS_April22_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat p77to85_colnames p77to85_CHR{}.dos '>' final_dos/p77to85_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat Q17_colnames Q17_CHR{}.dos '>' final_dos/Q17_CHR{}.dos ::: {1..22}
#parallel --jobs 8 cat Q35_colnames Q35_CHR{}.dos '>' final_dos/Q35_CHR{}.dos ::: {1..22}
##################join all files ## to make on dos file for each chr
import pandas as pd
from pandas import Series, DataFrame
# filelist_CHR1 lists the per-plate CHR1 .dos files; entries are picked out
# below by POSITION, so the order of that file matters.
filelist =[]
for rows in open('/media/labcomp/HDD3/GWAS/final_dos/filelist_CHR1', 'r'):
    filelist.append(rows.replace('\n', ''))
colstodrop=['CHR', 'POS', 'A1', 'A2']
# Load each plate's CHR1 table indexed by SNP, stripping the annotation
# columns so only dosage columns get joined.
pEUR_CHR1 = pd.read_table(filelist[-3], delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
pEUR_CHR1.drop(colstodrop, axis=1, inplace=True)
p35_36_55_CHR1 = pd.read_table(filelist[0], delim_whitespace=True, header=0, index_col='SNP')
p35_36_55_CHR1.drop(colstodrop, axis=1, inplace=True)
Q17_CHR1 = pd.read_table(filelist[-2], delim_whitespace=True, header=0, index_col='SNP')
Q17_CHR1.drop(colstodrop, axis=1, inplace=True)
Q35_CHR1 = pd.read_table(filelist[-1], delim_whitespace=True, header=0, index_col='SNP')
Q35_CHR1.drop(colstodrop, axis=1, inplace=True)
# p86to87 keeps its annotation columns and anchors the outer joins.
p86to87_CHR1 = pd.read_table(filelist[4], delim_whitespace=True, header=0, index_col='SNP')
p86to87_CHR1=p86to87_CHR1.join(pEUR_CHR1, how='outer')
p86to87_CHR1=p86to87_CHR1.join(p35_36_55_CHR1, how='outer')
p86to87_CHR1=p86to87_CHR1.join(Q17_CHR1, how='outer')
p86to87_CHR1=p86to87_CHR1.join(Q35_CHR1, how='outer')
# NOTE(review): this loop computes 'fileid' but never uses it -- looks like
# an abandoned fragment.  The indentation was lost in the paste; the drop /
# to_csv lines are taken to be OUTSIDE the loop (dropping A1/A2 twice would
# raise) -- confirm against the original run.
for names in filelist:
    fileid = names.split('.')[0]
p86to87_CHR1.drop(['A1', 'A2'], axis=1, inplace=True)
p86to87_CHR1.to_csv('/media/labcomp/HDD3/GWAS/final_dos/CHR1_merged.dos', index=True, header =True)
import pandas as pd
import numpy as np
# NOTE(review): dosage_merge is defined only BELOW this loop; executed
# straight through this would raise NameError, so the definition presumably
# ran first in the original interactive session.
for s in range(1, 23):
    # p86to87 anchors the merge; tag it with the chromosome number.
    a=pd.read_csv('/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_CHR'+str(s)+'.dos', header=0, index_col='SNP', low_memory =True)
    a['CHR'] = s
    b=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEUR_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    c=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p35_36_55_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    d=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/Q17_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    e=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/Q35_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    f=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEAS_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    filename ='/media/labcomp/HDD3/GWAS/final_dos/merged_dupsrm_CHR'+str(s)+'.dos'
    dosage_merge(a, b, c, d, e, f, filename)
    print 'done with chr', str(s), 'written to', filename
def dosage_merge(a, b, c, d, e, f, filename):
    """Outer-join six per-plate dosage tables on their SNP index.

    *a* (p86to87) anchors the merge; each of *b*..*f* (pEUR, p35_36_55,
    Q17, Q35, pEAS) loses its CHR/POS/A1/A2 columns and exact-duplicate
    rows before joining.  The union of all SNPs is kept (how='outer') and
    the result is written to *filename* as CSV with the SNP index.
    """
    annotation_cols = ['CHR', 'POS', 'A1', 'A2']
    merged = a
    for plate in (b, c, d, e, f):
        plate.drop(annotation_cols, axis=1, inplace=True)
        plate.drop_duplicates(keep='last', inplace=True)
        merged = merged.join(plate, how='outer')
    merged.drop(['A1', 'A2'], axis=1, inplace=True)
    merged.to_csv(filename, index=True, header=True)
def dosage_merge(a, b, c, d, e, f, filename):
    """Outer-join six per-plate dosage tables on their SNP index.

    Redefines the version above: duplicate handling is applied only to *f*
    (pEAS), the table whose SNP index actually contained duplicates.

    a -- anchor DataFrame (p86to87), indexed by SNP; keeps its CHR column.
    b..f -- per-plate dosage DataFrames indexed by SNP; their CHR/POS/A1/A2
            annotation columns are dropped in place before joining.
    filename -- destination path (or buffer) for the merged CSV.
    """
    colstodrop = ['CHR', 'POS', 'A1', 'A2']
    b.drop(colstodrop, axis=1, inplace=True)
    c.drop(colstodrop, axis=1, inplace=True)
    d.drop(colstodrop, axis=1, inplace=True)
    e.drop(colstodrop, axis=1, inplace=True)
    f.drop(colstodrop, axis=1, inplace=True)
    # BUG FIX: the original did f = f.index.drop_duplicates(keep='last'),
    # which replaces the DataFrame with a deduplicated Index and breaks the
    # join below.  Keep the last ROW for each duplicated SNP id instead.
    f = f[~f.index.duplicated(keep='last')]
    a = a.join(b, how='outer')  # pEUR
    a = a.join(c, how='outer')  # p35_36_55
    a = a.join(d, how='outer')  # Q17
    a = a.join(e, how='outer')  # Q35
    a = a.join(f, how='outer')  # pEAS
    a.drop(['A1', 'A2'], axis=1, inplace=True)
    a.to_csv(filename, index=True, header=True)
################dos cleaning of p86to87
chr1 = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p86to87_CHR1.dos', header=0, delim_whitespace=True)
# Rows with a missing allele call on either side.
A1 = chr1[chr1['A1'] == '-']
A2 = chr1[chr1['A2'] == '-']
A1_A2 = pd.concat([A1, A2])
##get all duplicated SNPs
dupchr1 = chr1[chr1['SNP'].duplicated(keep=False)]
chr1.drop_duplicates(['SNP'], keep=False, inplace=True)  # keep only unique SNPs
# From the duplicates, keep rows with real allele calls ('-'/'0' = missing).
dupfil = dupchr1[(dupchr1.A1 != '-') & (dupchr1.A2 != '-')]
dupfil2 = dupfil[(dupfil.A1 != '0') & (dupfil.A2 != '0')]
total = pd.concat([chr1, dupfil2])
# BUG FIX: original had keep=Last (undefined name); the string 'last' is meant.
total.drop_duplicates(['SNP'], keep='last', inplace=True)
##############remove dups and clean
import numpy as np
import pandas as pd
def filterdups(a, output):
    """Drop duplicated SNP rows, preferring entries with real allele calls.

    Rows whose SNP id occurs more than once are set aside; of those, only
    rows where neither allele is missing ('-' or '0') are kept, then merged
    back with the uniquely-occurring SNPs and written to *output* as CSV.
    Note: *a* is modified in place (its duplicated rows are removed).
    """
    dup_rows = a[a['SNP'].duplicated(keep=False)]
    a.drop_duplicates(['SNP'], keep=False, inplace=True)
    called = dup_rows[(dup_rows.A1 != '-') & (dup_rows.A2 != '-')]
    called = called[(called.A1 != '0') & (called.A2 != '0')]
    combined = pd.concat([a, called])
    combined.drop_duplicates(['SNP'], keep='last', inplace=True)
    combined.to_csv(output, index=False, header=True)
# Apply the duplicate filter to every p86to87 chromosome file.
for s in range(1, 23):
    a = pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p86to87_April22_CHR' + str(s) + '.dos',
                      delim_whitespace=True, header=0, low_memory=True)
    output = '/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_April22_CHR' + str(s) + '.dos'
    filterdups(a, output)
#analysis of only EM samples, make doasage for
#p86to87
import pandas as pd
import numpy as np
# NOTE(review): this loop calls the 4-argument dosage_merge defined BELOW;
# run straight through this would raise NameError -- the definition
# presumably ran first in the original interactive session.
for s in range(1, 23):
    # p86to87 anchors the merge; tag it with the chromosome number.
    a=pd.read_csv('/media/labcomp/HDD3/GWAS/final_dos/nodups_p86to87_April22_CHR'+str(s)+'.dos', header=0, index_col='SNP', low_memory =True)
    a['CHR'] = s
    b=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEUR_April22_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    c=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/p35_36_55_April22_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    #d=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/Q17_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    #e=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/Q35_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    # NOTE: 'd' here is pEAS (the Japanese Q17/Q35 plates are excluded above).
    d=pd.read_table('/media/labcomp/HDD3/GWAS/final_dos/pEAS_April22_CHR'+str(s)+'.dos', delim_whitespace=True, header=0, index_col='SNP', low_memory =True)
    filename ='/media/labcomp/HDD3/GWAS/final_dos/EMplates_April22_CHR'+str(s)+'.dos'
    dosage_merge(a, b, c, d, filename)
    print 'done with chr', str(s), 'written to', filename
def dosage_merge(a, b, c, d, filename):
    """Inner-join four per-plate dosage tables on their SNP index.

    *a* is the anchor (p86to87); *b*, *c*, *d* (pEUR, p35_36_55, pEAS) have
    their CHR/POS/A1/A2 columns stripped and exact-duplicate rows removed
    before joining.  Only SNPs present in all four tables survive.  The
    merged table is written to *filename* as CSV with the SNP index.
    """
    annotation_cols = ['CHR', 'POS', 'A1', 'A2']
    for plate in (b, c, d):
        plate.drop(annotation_cols, axis=1, inplace=True)
        plate.drop_duplicates(keep='last', inplace=True)
    merged = a
    for plate in (b, c, d):
        merged = merged.join(plate, how='inner')
    merged.drop(['A1', 'A2'], axis=1, inplace=True)
    merged.to_csv(filename, index=True, header=True)
###########################################################################concat all chrs to make one big dosage file################################################
import pandas as pd
doslist = open('EM_April23_doslist', 'r')
dosages = []
for i in doslist:
    dos = i.replace('\n', '')
    tempdos = pd.read_csv(dos, index_col='SNP', header=0, low_memory=True, verbose=True)
    dosages.append(tempdos)
total_dos = pd.concat(dosages)
### calculate maf for each snp
import numpy as np
# Minor-allele frequency per SNP: mean dosage / 2 is the allele frequency;
# MAF is whichever of (freq, 1 - freq) is smaller.
maf = []
for i in range(0, len(total_dos)):
    # BUG FIX: .ix was removed from pandas (>=1.0); use positional .iloc.
    freq = np.mean(total_dos.iloc[i, 2:]) / 2
    maf.append(np.min(np.array([freq, 1 - freq])))
###################################################################asscoiation testing for QTLs#########################################################################
# Parse the association table header into a list of column names.
a = open('header_full', 'r')
header_cols = None
for line in a:
    print(line)
    line = line.replace('\r', '')
    line = line.replace('\n', '')
    line = line.split('\t')
    header_cols = line  # keep the parsed header list
b = open('full', 'r')
out2 = []
n = 0
for line in b:
    n += 1
    if n > 1:  # skip the header row
        line = line.replace('\n', '')
        line = line.replace('\r', '')
        fields = line.split('\t')
        fields2 = []
        for i in fields:
            # BUG FIX: was 'len(i) is 0' -- identity comparison on an int.
            if len(i) == 0:
                fields2.append('NA')  # empty cell -> explicit missing marker
            else:
                fields2.append(i)
        out2.append(fields2)
# BUG FIX: the data loop above clobbered 'line', so the original
# 'final_out.columns = line' would have used the LAST DATA ROW, not the
# header; use the header list saved before the loop instead.
final_out = pd.DataFrame(out2)
final_out.columns = header_cols
fullassoc = pd.read_csv('full', header=0, usecols=header_cols)
###read in the association files(EM, EM_CSF, EM_SERUm, JP, EM_CS_SERUMRATIO)
# Merge each QTL result set with the disease-association table (final_out)
# on SNP id, sort by QTL p-value, and write the result out.
# NOTE(review): this first file is read with delim_whitespace=True despite
# the .csv name -- presumably it is actually whitespace-separated; confirm.
EM_CSF= pd.read_csv('TOTAL_EMonly_QTLs.csv', header=0, delim_whitespace=True)
EM_CSF_merge=pd.merge(EM_CSF, final_out, left_on='snps', right_on='SNPS', how='inner')
EM_CSF_merge.sort_values(by='pvalue', ascending=True, inplace=True)
EM_CSF_merge.to_csv('EM_CSF_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
EM_SERUM = pd.read_csv('TOTAL_EMonly_Serum_QTLs.csv', header=0)
EM_SERUM_merge=pd.merge(EM_SERUM, final_out, left_on='snps', right_on='SNPS', how='inner')
EM_SERUM_merge.sort_values(by='pvalue', ascending=True, inplace=True)
EM_SERUM_merge.to_csv('EM_SERUM_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
EM_ratios = pd.read_csv('TOTAL_ratio_QTLs.csv', header=0)
EM_ratios_merge=pd.merge(EM_ratios, final_out, left_on='snps', right_on='SNPS', how='inner')
EM_ratios_merge.sort_values(by='pvalue', ascending=True, inplace=True)
EM_ratios_merge.to_csv('EM_Ratios_ASSOCIATION_WITH_DIS_TRAIT.csv', index=False)
##########################################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from pandas import DataFrame
# BUG FIX: the original was pd.read_csv('EM_CSF_QTLs_Apr3.csv', 'r') --
# the second positional argument of read_csv is `sep`, so 'r' became the
# separator (a copy/paste from open()).  Read with the default comma.
EM_CSF = pd.read_csv('EM_CSF_QTLs_Apr3.csv')
fields = []
x = open('Gene_Clusters.txt', 'r')
for line in x:
    # Turn tab-separated cluster members into ';'-joined strings.
    line2 = line.replace('\t', ';')
    line3 = line2.split('\n')
    print(line3)
    fields.append(line3)
for f in fields:
    f2 = "".join(f)
    print(f2)
###########################get all hardgenotypes
import pandas as pd
# NOTE(review): 'cd /media/...' was an IPython shell magic pasted into the
# script; it is not valid Python, so it is kept only as a comment.
# cd /media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/
bimlist = ['HCRT_batch2_AA_3.bim', 'Plate35_36_55.bim', 'EAS_plate.bim', 'EUR_plate.bim']
path = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/'
bims = []
for i in bimlist:
    # .bim files are whitespace separated with no header row.
    temp = pd.read_csv(path + i, delim_whitespace=True, header=None)
    temp['plateID'] = i  # tag every SNP row with its source plate file
    bims.append(temp)
bims[0].plateID = 'Plate86to87'  # friendlier label for the first plate
EM_genotypedsnps = pd.concat(bims)
EM_genotypedsnps = EM_genotypedsnps.rename(columns={0: 'CHR', 1: 'RSID', 3: 'POS', 4: 'A1', 5: 'A2'})
#p86to87 = HCRT_batch2_AA_3.bim
#pEUR = EUR_plate
#p77to85QC = '/media/labcomp/HDD/GWASProjects/NarcolepsyGWASdata/Plate77to85QC_F.bim'
#p35_55_36 = Plate35_36_55
#pEAS = EAS_plate.bim
# R snippet (not Python): summarize genotyped SNP counts per plate from the
# exported CSV.  NOTE(review): fread() comes from data.table, which is
# presumably loaded elsewhere -- confirm; only reshape2 is required here.
require(reshape2)
em_genos = fread('~/Desktop/EM_genotyped_snps.csv', header = T)
# Wide table: one row per SNP (with alleles), one column per plateID,
# counting how many times each SNP appears on that plate.
agg_em=dcast(data = em_genos, formula = CHR+RSID+POS+A1+A2 ~ plateID, fun.aggregate = length)
# Same summary, ignoring the allele columns.
agg_em2=dcast(data = em_genos, formula = CHR+RSID+POS ~ plateID, fun.aggregate = length)
write.csv(agg_em2, file = '~/Desktop/EM_genotypes_summarized_April22.csv', row.names = F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment