nievergeltlab/derive_virtual.sh

## derive_virtual.sh

study=prom

for chr in {1..22}
do
plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr
done


K=2

windowsize=5000000
window_increment=5000000
minimum_snps=5 #at least this many snps in window?
for chr in {1..22}
do
 chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1)
 chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1)

 windowstart=$chrstart
 windowstop=$(($chrstart + windowsize))

 #Variables to note end of loop.
 reached_limit=0
 keep_looping=1

 while [ $keep_looping == 1 ]
 do


  plink2 --bfile data/"$study"_"$chr" --chr $chr  --from-bp $windowstart --to-bp $windowstop --make-bed --out "$study"_"$chr"_"$windowstart"_"$windowstop"

  #Only do the test if the window is actually containing a decent amount of SNPs
  #I need to incorporate something to compare the previous to last to check window size. I think a subtractive join, counting the length of the non-mergeable elements would work.
  #that isnt essential, but gets rid of redundant files

  nsnps_window=$(wc -l "$study"_"$chr"_"$windowstart"_"$windowstop".bim | awk '{print $1}')
  if [ "$nsnps_window" -ge "$minimum_snps" ]
  then
   admixture_linux-1.3.0/admixture -j6 "$study"_"$chr"_"$windowstart"_"$windowstop".bed 2 #check output code
   mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".P  admixture_results/.
   mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q  admixture_results/.

  fi

  rm -f "$study"_"$chr"_"$windowstart"_"$windowstop".*
  windowstart=$(($windowstart+$window_increment))
  windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..

 if [ $windowstop -ge $chrstop ]
  then
  #Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
     reached_limit=$((reached_limit + 1))
  #But otherwise, dont do this again!
     if [ $reached_limit -ge 1 ]
     then
      keep_looping=0
     fi
 fi


 done
done

#Now that we have all of these files, we need to identify, for each segment, the un-admixed individuals.
#It is easy if the average admixture is strongly native, ebcause I just take the column with the higher average.
#If not, I need to match to some kind of reference.
#So this analysis should probably take place in individuals who 'seem' native based on aims clustering.
#After that, I need to take hte individuasl with HIGH native admixture.

ls admixture_results/ | grep "$study"_ | grep .Q > admixture_files.txt


R
 cutoff=0.9 #Whatever the cutoff is, it should be greater than the average of the sample to gain an improvement. so a 90% pure sample needs something close to 100%!

 #Read fam and ancestry files
 d2 <- read.table('pts_PROM_mix_am-qc.fam',header=F,stringsAsFactors=F)
 names(d2)[1:2] <- c("FID","IID")
 d2$order <- 1:dim(d2)[1]
 d3 <- read.table('pts_PROM_mix_am-qc.predpc',header=T,stringsAsFactors=F)
 dm <- merge(d2,d3,by=c("FID","IID"),all.x=TRUE)
 dm <- dm[order(dm$order),]

filelist <- read.table('admixture_files.txt',stringsAsFactors=F)

for (admix_file in filelist$V1)
{
 d1 <- read.table(paste('admixture_results/',admix_file,sep=''))
 names(d1) <- c("popa","popb")

 admix_dat <- cbind(d1,dm)


 #Get % population 1
 popa_admix <- mean(admix_dat$popa)
 popb_admix <- mean(admix_dat$popb)

 #print(popa-popb)
 maxpop <- which.max(c(popa_admix,popb_admix))

 if(maxpop == 1)
 {
  natives="popa" #Set to correct column
 }
 if(maxpop == 2)
 {
  natives="popb" #Set to correct column
 }

 #Look across the native column for the really un-admixed people
 admix_dat[,natives]

 true_natives <- which(admix_dat[,natives] > cutoff)
 write.table(admix_dat[true_natives,c("FID","IID")], file=paste('selection_lists/',admix_file,'.subjects',sep=''),col.names=F,row.names=F,quote=F)

}

#Now rewrite the genome of all subjects...

for chr in {1..22}
do
 chrstart=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g | head -n1)
 chrstop=$(cat data/"$study"_"$chr".bim | awk '{print $4}' | sort -g -r | head -n1)

 windowstart=$chrstart
 windowstop=$(($chrstart + windowsize))

 #Variables to note end of loop.
 reached_limit=0
 keep_looping=1

 while [ $keep_looping == 1 ]
 do
  plink2 --bfile data/"$study"_"$chr" --chr $chr  --from-bp $windowstart --to-bp $windowstop --keep selection_lists/"$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q.subjects --make-bed --out reconstituted/"$study"_"$chr"_"$windowstart"_"$windowstop"

  windowstart=$(($windowstart+$window_increment))
  windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..

 if [ $windowstop -ge $chrstop ]
  then
  #Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
     reached_limit=$((reached_limit + 1))
  #But otherwise, dont do this again!
     if [ $reached_limit -ge 1 ]
     then
      keep_looping=0
     fi
 fi

 done
done

   ls reconstituted | grep .bed | sed 's/.bed//g' | awk '{print "reconstituted/"$1".bed","reconstituted/"$1".bim","reconstituted/"$1".fam"}' > reconstituted.mergelist
   plink2 --merge-list reconstituted.mergelist --make-bed --out "$study"_90pct

	study=prom

	for chr in {1..22}
	do
	plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr
	done



	K=2

	windowsize=5000000
	window_increment=5000000
	minimum_snps=5 #at least this many snps in window?
	for chr in {1..22}
	do
	chrstart=$(cat data/"$study"_"$chr".bim \| awk '{print $4}' \| sort -g \| head -n1)
	chrstop=$(cat data/"$study"_"$chr".bim \| awk '{print $4}' \| sort -g -r \| head -n1)

	windowstart=$chrstart
	windowstop=$(($chrstart + windowsize))

	#Variables to note end of loop.
	reached_limit=0
	keep_looping=1

	while [ $keep_looping == 1 ]
	do


	plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --make-bed --out "$study"_"$chr"_"$windowstart"_"$windowstop"

	#Only do the test if the window is actually containing a decent amount of SNPs
	#I need to incorporate something to compare the previous to last to check window size. I think a subtractive join, counting the length of the non-mergeable elements would work.
	#that isnt essential, but gets rid of redundant files

	nsnps_window=$(wc -l "$study"_"$chr"_"$windowstart"_"$windowstop".bim \| awk '{print $1}')
	if [ "$nsnps_window" -ge "$minimum_snps" ]
	then
	admixture_linux-1.3.0/admixture -j6 "$study"_"$chr"_"$windowstart"_"$windowstop".bed 2 #check output code
	mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".P admixture_results/.
	mv "$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q admixture_results/.

	fi

	rm -f "$study"_"$chr"_"$windowstart"_"$windowstop".*
	windowstart=$(($windowstart+$window_increment))
	windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..

	if [ $windowstop -ge $chrstop ]
	then
	#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
	reached_limit=$((reached_limit + 1))
	#But otherwise, dont do this again!
	if [ $reached_limit -ge 1 ]
	then
	keep_looping=0
	fi
	fi



	done
	done

	#Now that we have all of these files, we need to identify, for each segment, the un-admixed individuals.
	#It is easy if the average admixture is strongly native, ebcause I just take the column with the higher average.
	#If not, I need to match to some kind of reference.
	#So this analysis should probably take place in individuals who 'seem' native based on aims clustering.
	#After that, I need to take hte individuasl with HIGH native admixture.

	ls admixture_results/ \| grep "$study"_ \| grep .Q > admixture_files.txt


	R
	cutoff=0.9 #Whatever the cutoff is, it should be greater than the average of the sample to gain an improvement. so a 90% pure sample needs something close to 100%!

	#Read fam and ancestry files
	d2 <- read.table('pts_PROM_mix_am-qc.fam',header=F,stringsAsFactors=F)
	names(d2)[1:2] <- c("FID","IID")
	d2$order <- 1:dim(d2)[1]
	d3 <- read.table('pts_PROM_mix_am-qc.predpc',header=T,stringsAsFactors=F)
	dm <- merge(d2,d3,by=c("FID","IID"),all.x=TRUE)
	dm <- dm[order(dm$order),]

	filelist <- read.table('admixture_files.txt',stringsAsFactors=F)

	for (admix_file in filelist$V1)
	{
	d1 <- read.table(paste('admixture_results/',admix_file,sep=''))
	names(d1) <- c("popa","popb")

	admix_dat <- cbind(d1,dm)


	#Get % population 1
	popa_admix <- mean(admix_dat$popa)
	popb_admix <- mean(admix_dat$popb)

	#print(popa-popb)
	maxpop <- which.max(c(popa_admix,popb_admix))

	if(maxpop == 1)
	{
	natives="popa" #Set to correct column
	}
	if(maxpop == 2)
	{
	natives="popb" #Set to correct column
	}

	#Look across the native column for the really un-admixed people
	admix_dat[,natives]

	true_natives <- which(admix_dat[,natives] > cutoff)
	write.table(admix_dat[true_natives,c("FID","IID")], file=paste('selection_lists/',admix_file,'.subjects',sep=''),col.names=F,row.names=F,quote=F)

	}

	#Now rewrite the genome of all subjects...

	for chr in {1..22}
	do
	chrstart=$(cat data/"$study"_"$chr".bim \| awk '{print $4}' \| sort -g \| head -n1)
	chrstop=$(cat data/"$study"_"$chr".bim \| awk '{print $4}' \| sort -g -r \| head -n1)

	windowstart=$chrstart
	windowstop=$(($chrstart + windowsize))

	#Variables to note end of loop.
	reached_limit=0
	keep_looping=1

	while [ $keep_looping == 1 ]
	do
	plink2 --bfile data/"$study"_"$chr" --chr $chr --from-bp $windowstart --to-bp $windowstop --keep selection_lists/"$study"_"$chr"_"$windowstart"_"$windowstop"."$K".Q.subjects --make-bed --out reconstituted/"$study"_"$chr"_"$windowstart"_"$windowstop"

	windowstart=$(($windowstart+$window_increment))
	windowstop=$(($windowstart + $windowsize)) #put something in here to make sure that the bed file is actually bigger..

	if [ $windowstop -ge $chrstop ]
	then
	#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
	reached_limit=$((reached_limit + 1))
	#But otherwise, dont do this again!
	if [ $reached_limit -ge 1 ]
	then
	keep_looping=0
	fi
	fi

	done
	done

	ls reconstituted \| grep .bed \| sed 's/.bed//g' \| awk '{print "reconstituted/"$1".bed","reconstituted/"$1".bim","reconstituted/"$1".fam"}' > reconstituted.mergelist
	plink2 --merge-list reconstituted.mergelist --make-bed --out "$study"_90pct