Last active
June 29, 2021 19:11
-
-
Save nievergeltlab/43bfbc650d2b4ab1b4be4b5f03ce22bc to your computer and use it in GitHub Desktop.
Compare allele frequency differences across populations. Then perform a clumping procedure, where the most significantly differentiated markers are kept.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Find ancestry informative markers, using 1000 Genomes reference data as an example | |
#Use global ancestry as a phenotype for association analysis in unrelated KGP subjects | |
#Perform LD based clumping so that in each LD block, the strongest AIMs are selected | |
#Making sure that no indels or ambiguous markers are used | |
#Liftover markers (if you care) | |
#Get subject populations | |
#Extract only AFR and EA samples from the 1000G VCF data, on non monomorphic markers | |
#wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.txt | |
awk '{if ($2=="europe") print $1}' 1000g_pops_wrs.txt > 1000g_europeans.subjects | |
awk '{if ($2=="africa") print $1}' 1000g_pops_wrs.txt > 1000g_africans.subjects | |
#Make EA/A phenotypes | |
cat 1000g_europeans.subjects 1000g_africans.subjects > 1000g_europeans_africans.subjects | |
awk '{if ($2=="europe") print $1,".",1}' 1000g_pops_wrs.txt > 1000g_europeans.pheno | |
awk '{if ($2=="africa") print $1,".",2}' 1000g_pops_wrs.txt > 1000g_africans.pheno | |
cat 1000g_europeans.pheno 1000g_africans.pheno > 1000g_europeans_africans.pheno | |
#Extract each chromosome | |
for i in {1..22} | |
do | |
#plinkseq-0.10/pseq /home/genetics/Desktop/psychchip_aims_panel/kgp_extract_snps/vcfs/ALL.chr"$i".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz write-ped --mask indiv=@1000g_europeans_africans.subjects meta.req=EUR_AF:gt:0.02,EUR_AF:lt:.98,AFR_AF:gt:.02,AFR_AF:lt:.98 --out kgp_eurafr_chr$i | |
#./plink --tfile kgp_eurafr_chr$i --pheno 1000g_europeans_africans.pheno --make-bed --out kgp_eurafr_chr$i | |
#rm -f "kgp_eurafr_chr$i".tped | |
awk '{print $2}' kgp_eurafr_chr$i.bim | uniq -d > kgp_eurafr_chr"$i".triallelic | |
awk '{if ((length($5) > 1) || (length($6) > 1)) print $2}' kgp_eurafr_chr$i.bim > kgp_eurafr_chr"$i".indel | |
grep -P "A\tT" kgp_eurafr_chr"$i".bim | awk '{print $2}' > ambiguous_snps_chr"$i".txt | |
grep -P "T\tA" kgp_eurafr_chr"$i".bim | awk '{print $2}'>> ambiguous_snps_chr"$i".txt | |
grep -P "C\tG" kgp_eurafr_chr"$i".bim | awk '{print $2}'>> ambiguous_snps_chr"$i".txt | |
grep -P "G\tC" kgp_eurafr_chr"$i".bim | awk '{print $2}'>> ambiguous_snps_chr"$i".txt | |
grep "var" kgp_eurafr_chr"$i".bim | awk '{print $2}'>> ambiguous_snps_chr"$i".txt | |
cat kgp_eurafr_chr"$i".triallelic kgp_eurafr_chr"$i".indel ambiguous_snps_chr"$i".txt > remove_snps_chr"$i".txt | |
./plink --bfile kgp_eurafr_chr$i --exclude remove_snps_chr"$i".txt --make-bed --out fkgp_eurafr_chr$i | |
done | |
#Attempt to merge the data | |
ls fkgp_eurafr_chr*.bed | sed 's/.bed//g' | awk '{print $1".bed",$1".bim",$1".fam"}' > kgp_eurafr.mergelist | |
./plink --merge-list kgp_eurafr.mergelist --allow-no-sex --make-bed --out kgp_eurafr_allchr #The merge step is what identifies tri allelics, because tri allelic markers get listed twice | |
#Run relatedness on EA and AA samples to filter out relateds | |
./plink --bfile kgp_eurafr_allchr --maf 0.05 --indep-pairwise 50 5 0.2 --out "kgp_eurafr_allchr"_ibd | |
./plink --bfile kgp_eurafr_allchr --mind 0.02 --extract "kgp_eurafr_allchr"_ibd.prune.in --genome --min 0.2 --out kgp_eurafr_allchr_ibd | |
awk '{if(NR ==1) print "FID","IID"; else print $3,$4}' kgp_eurafr_allchr_ibd.genome > "kgp_eurafr_allchr"_ibd.remove | |
#Run assoc analysis on race | |
./plink --bfile kgp_eurafr_allchr --remove "kgp_eurafr_allchr"_ibd.remove --logistic --allow-no-sex --out kgp_eurafr_allchr_norel_assn | |
./plink --bfile kgp_eurafr_allchr --clump kgp_eurafr_allchr_norel_assn.assoc.logistic --allow-no-sex --remove "kgp_eurafr_allchr"_ibd.remove --clump-p1 5e-7 --clump-p2 5e-7 --clump-r2 .1 --clump-kb 100 --out kgp_eurafr_clump | |
awk 'NR>1{print $3}' kgp_eurafr_clump.clumped | grep -v '^ *$' > kgp_eurafr_clump.snplist | |
#Take random 100k out of this | |
sort --random-sort -k1 kgp_eurafr_clump.snplist | head -n 100000 > kgp_eurafr_clump.snplist.random | |
./plink --bfile kgp_eurafr_allchr --remove "kgp_eurafr_allchr"_ibd.remove --extract kgp_eurafr_clump.snplist.random --freq --allow-no-sex --assoc --out 100k_afs | |
./plink --bfile kgp_eurafr_allchr --remove "kgp_eurafr_allchr"_ibd.remove --extract kgp_eurafr_clump.snplist.random --freq --allow-no-sex --out 100k_afs | |
R | |
d1 <- read.table('kgp_eurafr_clump.snplist',header=F,stringsAsFactors=F,nr=250000) | |
indel <- which(nchar(d1[,5]) > 1 | nchar(d1[,6]) > 1) | |
d2<- d1[-indel,] | |
set.seed(17) | |
keep <- sample(1:dim(d2)[1],100000,replace=F) | |
write.table(d2[keep,2],"100k_afdifmarkers.snplist",quote=F,row.names=F) | |
q("no") | |
#Concatenate all clumps, filter genotype data to them | |
cat kgp_eurafr_clump_chr*.clumped | awk '{print $3}' | grep -v SNP > kgp_eurafr_clump_allchr.clumped | |
./plink --bfile kgp_eurafr_allchr --extract kgp_eurafr_clump_allchr.clumped --make-bed --out kgp_eurafr_allchr_200kclump | |
#Check for liftover - #Every single one is unchaged evidently | |
python liftover.py kgp_eurafr_allchr_200kclump.map > kgp_eurafr_allchr_200kclump.lift | |
#Read in the list and remove the indels, filter to 100k markers | |
R | |
d1 <- read.table('kgp_eurafr_allchr_200kclump.bim',header=F,stringsAsFactors=F,nr=250000) | |
indel <- which(nchar(d1[,5]) > 1 | nchar(d1[,6]) > 1) | |
d2<- d1[-indel,] | |
set.seed(17) | |
keep <- sample(1:dim(d2)[1],100000,replace=F) | |
write.table(d2[keep,2],"100k_afdifmarkers.snplist",quote=F,row.names=F) | |
#Get basic summary data on these markers... | |
./plink --bfile kgp_eurafr_allchr --extract 100k_afdifmarkers.snplist --freq --allow-no-sex --assoc --out 100k_afs | |
#With r2=.2, approx 6% of the 6.5 million variants are clumped... want even less coverage | |
#Update to b144 | |
wget http://genome.sph.umich.edu/wiki/LiftRsNumber.py | |
wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database/organism_data/b144_SNPChrPosOnRef_107.bcp.gz | |
python liftover.py kgp_eurafr_allchr.map | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment