Last active
June 29, 2021 19:13
-
-
Save nievergeltlab/c2a079343144e39962bb9e0373a22bb5 to your computer and use it in GitHub Desktop.
Reference subjects may be admixed. To create un-admixed virtual subjects, go over a rolling window, performing unsupervised clustering of subjects. For a given subject, only keep the region windows where they belong to their purported population.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
study=prom | |
for chr in {1..22} | |
do | |
plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr | |
done | |
K=2 | |
windowsize=5000000 | |
window_increment=5000000 | |
minimum_snps=5 #at least this many snps in window? | |
for chr in {1..22} | |
do | |
chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1) | |
chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1) | |
windowstart=$chrstart | |
windowstop=$(($chrstart + windowsize)) | |
#Variables to note end of loop. | |
reached_limit=0 | |
keep_looping=1 | |
while [ $keep_looping == 1 ] | |
do | |
plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --make-bed --out "$study"_"$chr"_"$windowstart"_"$windowstop" | |
#Only do the test if the window is actually containing a decent amount of SNPs | |
#I need to incorporate something to compare the previous to last to check window size. I think a subtractive join, counting the length of the non-mergeable elements would work. | |
#that isnt essential, but gets rid of redundant files | |
nsnps_window=$(wc -l "$study"_"$chr"_"$windowstart"_"$windowstop".bim | awk '{print $1}') | |
if [ "$nsnps_window" -ge "$minimum_snps" ] | |
then | |
admixture_linux-1.3.0/admixture -j6 "$study"_"$chr"_"$windowstart"_"$windowstop".bed 2 #check output code | |
mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".P admixture_results/. | |
mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q admixture_results/. | |
fi | |
rm -f "$study"_"$chr"_"$windowstart"_"$windowstop".* | |
windowstart=$(($windowstart+$window_increment)) | |
windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger.. | |
if [ $windowstop -ge $chrstop ] | |
then | |
#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet | |
reached_limit=$((reached_limit + 1)) | |
#But otherwise, dont do this again! | |
if [ $reached_limit -ge 1 ] | |
then | |
keep_looping=0 | |
fi | |
fi | |
done | |
done | |
#Now that we have all of these files, we need to identify, for each segment, the un-admixed individuals. | |
#It is easy if the average admixture is strongly native, ebcause I just take the column with the higher average. | |
#If not, I need to match to some kind of reference. | |
#So this analysis should probably take place in individuals who 'seem' native based on aims clustering. | |
#After that, I need to take hte individuasl with HIGH native admixture. | |
ls admixture_results/ | grep "$study"_ | grep .Q > admixture_files.txt | |
R | |
cutoff=0.9 #Whatever the cutoff is, it should be greater than the average of the sample to gain an improvement. so a 90% pure sample needs something close to 100%! | |
#Read fam and ancestry files | |
d2 <- read.table('pts_PROM_mix_am-qc.fam',header=F,stringsAsFactors=F) | |
names(d2)[1:2] <- c("FID","IID") | |
d2$order <- 1:dim(d2)[1] | |
d3 <- read.table('pts_PROM_mix_am-qc.predpc',header=T,stringsAsFactors=F) | |
dm <- merge(d2,d3,by=c("FID","IID"),all.x=TRUE) | |
dm <- dm[order(dm$order),] | |
filelist <- read.table('admixture_files.txt',stringsAsFactors=F) | |
for (admix_file in filelist$V1) | |
{ | |
d1 <- read.table(paste('admixture_results/',admix_file,sep='')) | |
names(d1) <- c("popa","popb") | |
admix_dat <- cbind(d1,dm) | |
#Get % population 1 | |
popa_admix <- mean(admix_dat$popa) | |
popb_admix <- mean(admix_dat$popb) | |
#print(popa-popb) | |
maxpop <- which.max(c(popa_admix,popb_admix)) | |
if(maxpop == 1) | |
{ | |
natives="popa" #Set to correct column | |
} | |
if(maxpop == 2) | |
{ | |
natives="popb" #Set to correct column | |
} | |
#Look across the native column for the really un-admixed people | |
admix_dat[,natives] | |
true_natives <- which(admix_dat[,natives] > cutoff) | |
write.table(admix_dat[true_natives,c("FID","IID")], file=paste('selection_lists/',admix_file,'.subjects',sep=''),col.names=F,row.names=F,quote=F) | |
} | |
#Now rewrite the genome of all subjects... | |
for chr in {1..22} | |
do | |
chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1) | |
chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1) | |
windowstart=$chrstart | |
windowstop=$(($chrstart + windowsize)) | |
#Variables to note end of loop. | |
reached_limit=0 | |
keep_looping=1 | |
while [ $keep_looping == 1 ] | |
do | |
plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --keep selection_lists/"$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q.subjects --make-bed --out reconstituted/"$study"_"$chr"_"$windowstart"_"$windowstop" | |
windowstart=$(($windowstart+$window_increment)) | |
windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger.. | |
if [ $windowstop -ge $chrstop ] | |
then | |
#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet | |
reached_limit=$((reached_limit + 1)) | |
#But otherwise, dont do this again! | |
if [ $reached_limit -ge 1 ] | |
then | |
keep_looping=0 | |
fi | |
fi | |
done | |
done | |
ls reconstituted | grep .bed | sed 's/.bed//g' | awk '{print "reconstituted/"$1".bed","reconstituted/"$1".bim","reconstituted/"$1".fam"}' > reconstituted.mergelist | |
plink2 --merge-list reconstituted.mergelist --make-bed --out "$study"_90pct | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment