Skip to content

Instantly share code, notes, and snippets.

@nievergeltlab
Last active June 29, 2021 19:13
Show Gist options
  • Save nievergeltlab/c2a079343144e39962bb9e0373a22bb5 to your computer and use it in GitHub Desktop.
Save nievergeltlab/c2a079343144e39962bb9e0373a22bb5 to your computer and use it in GitHub Desktop.
Reference subjects may be admixed. To create un-admixed virtual subjects, go over a rolling window, performing unsupervised clustering of subjects. For a given subject, only keep the region windows where they belong to their purported population.
study=prom
for chr in {1..22}
do
plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr
done
K=2
windowsize=5000000
window_increment=5000000
minimum_snps=5 #at least this many snps in window?
for chr in {1..22}
do
chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1)
chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1)
windowstart=$chrstart
windowstop=$(($chrstart + windowsize))
#Variables to note end of loop.
reached_limit=0
keep_looping=1
while [ $keep_looping == 1 ]
do
plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --make-bed --out "$study"_"$chr"_"$windowstart"_"$windowstop"
#Only do the test if the window is actually containing a decent amount of SNPs
#I need to incorporate something to compare the previous to last to check window size. I think a subtractive join, counting the length of the non-mergeable elements would work.
#that isnt essential, but gets rid of redundant files
nsnps_window=$(wc -l "$study"_"$chr"_"$windowstart"_"$windowstop".bim | awk '{print $1}')
if [ "$nsnps_window" -ge "$minimum_snps" ]
then
admixture_linux-1.3.0/admixture -j6 "$study"_"$chr"_"$windowstart"_"$windowstop".bed 2 #check output code
mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".P admixture_results/.
mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q admixture_results/.
fi
rm -f "$study"_"$chr"_"$windowstart"_"$windowstop".*
windowstart=$(($windowstart+$window_increment))
windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..
if [ $windowstop -ge $chrstop ]
then
#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
reached_limit=$((reached_limit + 1))
#But otherwise, dont do this again!
if [ $reached_limit -ge 1 ]
then
keep_looping=0
fi
fi
done
done
#Now that we have all of these files, we need to identify, for each segment, the un-admixed individuals.
#It is easy if the average admixture is strongly native, ebcause I just take the column with the higher average.
#If not, I need to match to some kind of reference.
#So this analysis should probably take place in individuals who 'seem' native based on aims clustering.
#After that, I need to take hte individuasl with HIGH native admixture.
ls admixture_results/ | grep "$study"_ | grep .Q > admixture_files.txt
R
cutoff=0.9 #Whatever the cutoff is, it should be greater than the average of the sample to gain an improvement. so a 90% pure sample needs something close to 100%!
#Read fam and ancestry files
d2 <- read.table('pts_PROM_mix_am-qc.fam',header=F,stringsAsFactors=F)
names(d2)[1:2] <- c("FID","IID")
d2$order <- 1:dim(d2)[1]
d3 <- read.table('pts_PROM_mix_am-qc.predpc',header=T,stringsAsFactors=F)
dm <- merge(d2,d3,by=c("FID","IID"),all.x=TRUE)
dm <- dm[order(dm$order),]
filelist <- read.table('admixture_files.txt',stringsAsFactors=F)
for (admix_file in filelist$V1)
{
d1 <- read.table(paste('admixture_results/',admix_file,sep=''))
names(d1) <- c("popa","popb")
admix_dat <- cbind(d1,dm)
#Get % population 1
popa_admix <- mean(admix_dat$popa)
popb_admix <- mean(admix_dat$popb)
#print(popa-popb)
maxpop <- which.max(c(popa_admix,popb_admix))
if(maxpop == 1)
{
natives="popa" #Set to correct column
}
if(maxpop == 2)
{
natives="popb" #Set to correct column
}
#Look across the native column for the really un-admixed people
admix_dat[,natives]
true_natives <- which(admix_dat[,natives] > cutoff)
write.table(admix_dat[true_natives,c("FID","IID")], file=paste('selection_lists/',admix_file,'.subjects',sep=''),col.names=F,row.names=F,quote=F)
}
#Now rewrite the genome of all subjects...
for chr in {1..22}
do
chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1)
chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1)
windowstart=$chrstart
windowstop=$(($chrstart + windowsize))
#Variables to note end of loop.
reached_limit=0
keep_looping=1
while [ $keep_looping == 1 ]
do
plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --keep selection_lists/"$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q.subjects --make-bed --out reconstituted/"$study"_"$chr"_"$windowstart"_"$windowstop"
windowstart=$(($windowstart+$window_increment))
windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..
if [ $windowstop -ge $chrstop ]
then
#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
reached_limit=$((reached_limit + 1))
#But otherwise, dont do this again!
if [ $reached_limit -ge 1 ]
then
keep_looping=0
fi
fi
done
done
ls reconstituted | grep .bed | sed 's/.bed//g' | awk '{print "reconstituted/"$1".bed","reconstituted/"$1".bim","reconstituted/"$1".fam"}' > reconstituted.mergelist
plink2 --merge-list reconstituted.mergelist --make-bed --out "$study"_90pct
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment