nievergeltlab/lanc.r

## lanc.r
#Do a kcolumn split of local ancestry
#Do a delimiting of | then kcolumn split of haplotypes vcf

 zcat gtpc_phased_chr22.vcf.gz | tail -n+11 | sed 's/|/\t/g' > gtpc_phased_chr22.vcf.haps

 zcat gtpc_phased_chr22.vcf.gz  | head -n10 | tail -n1   | sed 's/#//g' | awk '{$1=$1; print}'  > gtpc_phased_chr22.vcf.haps.header

#Notice that the odd/even selection REMOVES the columns, rather than selecting them, hence the file names are haps1 and haps0
awk '{ for (i=10;i<=NF;i+=2)  $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps1
awk '{ for (i=11;i<=NF;i+=2)  $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps0


#In R, the easiest thing to do will be to append the trailing .0 and .1 on subject names to account for the haps names.
#note I use sed to change 'n snps' coluimn name to get rid of the dumb space delimiter
 awk '{ for (i=7;i<=NF;i+=2) $i=""  } 1' GTP.rfmix.chr22.msp.tsv | sed 's/n snps/nsnps/g' | sed 's/#//g'  | tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc1
 awk '{ for (i=8;i<=NF;i+=2) $i=""  } 1' GTP.rfmix.chr22.msp.tsv | sed 's/n snps/nsnps/g' | sed 's/#//g' | tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc0

#-f$(seq -s, 2 2 10)

R
 library(data.table)
 library(wrapr)
 library(speedglm)
 #Load covariate data sheets
  fam <- fread('gtpc_bg_cogb.fam',data.table=F)
  names(fam) <- c("FID","IID","M","F","G","P")
  covariate <- fread('pts_gtpc_mix_am-qc-aam_pca.menv.mds_cov',data.table=F)
  covs <- merge(fam,covariate,by=c("FID","IID"))

 #Load haps
  haps0 <- fread('gtpc_phased_chr22.vcf.haps0',data.table=F)
  haps1 <- fread('gtpc_phased_chr22.vcf.haps1',data.table=F)
 #append the .0 to the hap names to match the lanc files
  names(haps0)[-c(1:9)] <- paste(names(haps0)[-c(1:9)],".0",sep='')
  names(haps1)[-c(1:9)] <- paste(names(haps1)[-c(1:9)],".1",sep='')


  row.names(haps0) <- haps0$ID
  row.names(haps1) <- haps1$ID

 #make this just a matrix for processing speed!
  haps0a <- as.matrix(haps0[,-c(1:9)])
  haps1a <- as.matrix(haps1[,-c(1:9)])

  #can't remove unaltered haps matrix yet to save ram because the position is utilized in intervals. save as a variable to allow for memory saving

 #Load LANC
  lanc0 <- fread('GTP.rfmix.chr22.msp.tsv.lanc0',data.table=F)
  lanc1 <- fread('GTP.rfmix.chr22.msp.tsv.lanc1',data.table=F)

 #Convert local ancestry to sign values for calculating number of effect alleles. For greater than 2 ancestries, just add more equivalences (e.g. ..._anc2 == 2, ..._anc3 ==3 and so on)

  eqcheck <- function(x,eqval=0)
  {
   as.numeric(x == eqval)
  }

  lanc0_anc0 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=0))
  lanc1_anc0 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=0))

  lanc0_anc1 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=1))
  lanc1_anc1 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=1))

  #


 #Expand the local ancestry matrices so they conform to dimension of haplotypes (i.e. assign local ancestry to every SNP)
  intervals <- findInterval(haps0$POS, lanc0$epos,rightmost.closed=TRUE) + 1 #The +1 is because intervals bins start from 0. Also note how the right intervals is closed to account for end of chromsoome SNPS, which would otherwise be assigned to a non-existing interval..

 #The haps matrices are identical in dimension by construction. So this interval finding only needs to be done for one matrix.
 #I do it for both just in case for testing purposes
  lanc0_anc0_expanded <- lanc0_anc0[intervals,]
  lanc1_anc0_expanded <- lanc1_anc0[intervals,]

  lanc0_anc1_expanded <- lanc0_anc1[intervals,]
  lanc1_anc1_expanded <- lanc1_anc1[intervals,]

  save(lanc0_anc0_expanded,lanc1_anc0_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc0.anc0.R",quote=F,row.names=F)
  save(lanc0_anc1_expanded,lanc1_anc1_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc.anc1.R",quote=F,row.names=F)
  save(haps0a,haps1a,file="GTP.rfmix.chr22.msp.tsv.haps.R",quote=F,row.names=F)


#Currently only coded for 0/1, but expansible to 3...

#at this point I should switch the data format to matrices to speed things up!

#N copies of risk allele for ancestry 0
 hap0_anc0 <- lanc0_anc0_expanded * haps0a
 hap1_anc0 <- lanc1_anc0_expanded * haps1a

 anc0_ncopies <- hap0_anc0 + hap1_anc0

#N copies of risk allele for ancestry 1
 hap0_anc1 <- lanc0_anc1_expanded * haps0[,c(10:ncol(haps0))]
 hap1_anc1 <- lanc1_anc1_expanded * haps1[,c(10:ncol(haps0))]

 anc1_ncopies <- hap0_anc1 + hap1_anc1

#And so on for N groups > 2...

#Now have matrices where each row is a SNP and each column is a subject. Need to transpose to make this usable.

 anc0_ncopies_t <- t(anc0_ncopies)
 anc1_ncopies_t <- t(anc1_ncopies)

#Now each row is a subject and column is a SNP. this can now be mated with the phenotype/covariate file
 p <- match_order(d2$idx, d1$idx)


#Then just apply over all SNPs with a common formula
	#Do a kcolumn split of local ancestry
	#Do a delimiting of \| then kcolumn split of haplotypes vcf

	zcat gtpc_phased_chr22.vcf.gz \| tail -n+11 \| sed 's/\|/\t/g' > gtpc_phased_chr22.vcf.haps

	zcat gtpc_phased_chr22.vcf.gz \| head -n10 \| tail -n1 \| sed 's/#//g' \| awk '{$1=$1; print}' > gtpc_phased_chr22.vcf.haps.header

	#Notice that the odd/even selection REMOVES the columns, rather than selecting them, hence the file names are haps1 and haps0
	awk '{ for (i=10;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps \| cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps1
	awk '{ for (i=11;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps \| cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps0


	#In R, the easiest thing to do will be to append the trailing .0 and .1 on subject names to account for the haps names.
	#note I use sed to change 'n snps' coluimn name to get rid of the dumb space delimiter
	awk '{ for (i=7;i<=NF;i+=2) $i="" } 1' GTP.rfmix.chr22.msp.tsv \| sed 's/n snps/nsnps/g' \| sed 's/#//g' \| tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc1
	awk '{ for (i=8;i<=NF;i+=2) $i="" } 1' GTP.rfmix.chr22.msp.tsv \| sed 's/n snps/nsnps/g' \| sed 's/#//g' \| tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc0

	#-f$(seq -s, 2 2 10)

	R
	library(data.table)
	library(wrapr)
	library(speedglm)
	#Load covariate data sheets
	fam <- fread('gtpc_bg_cogb.fam',data.table=F)
	names(fam) <- c("FID","IID","M","F","G","P")
	covariate <- fread('pts_gtpc_mix_am-qc-aam_pca.menv.mds_cov',data.table=F)
	covs <- merge(fam,covariate,by=c("FID","IID"))

	#Load haps
	haps0 <- fread('gtpc_phased_chr22.vcf.haps0',data.table=F)
	haps1 <- fread('gtpc_phased_chr22.vcf.haps1',data.table=F)
	#append the .0 to the hap names to match the lanc files
	names(haps0)[-c(1:9)] <- paste(names(haps0)[-c(1:9)],".0",sep='')
	names(haps1)[-c(1:9)] <- paste(names(haps1)[-c(1:9)],".1",sep='')


	row.names(haps0) <- haps0$ID
	row.names(haps1) <- haps1$ID

	#make this just a matrix for processing speed!
	haps0a <- as.matrix(haps0[,-c(1:9)])
	haps1a <- as.matrix(haps1[,-c(1:9)])

	#can't remove unaltered haps matrix yet to save ram because the position is utilized in intervals. save as a variable to allow for memory saving

	#Load LANC
	lanc0 <- fread('GTP.rfmix.chr22.msp.tsv.lanc0',data.table=F)
	lanc1 <- fread('GTP.rfmix.chr22.msp.tsv.lanc1',data.table=F)

	#Convert local ancestry to sign values for calculating number of effect alleles. For greater than 2 ancestries, just add more equivalences (e.g. ..._anc2 == 2, ..._anc3 ==3 and so on)

	eqcheck <- function(x,eqval=0)
	{
	as.numeric(x == eqval)
	}

	lanc0_anc0 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=0))
	lanc1_anc0 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=0))

	lanc0_anc1 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=1))
	lanc1_anc1 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=1))

	#


	#Expand the local ancestry matrices so they conform to dimension of haplotypes (i.e. assign local ancestry to every SNP)
	intervals <- findInterval(haps0$POS, lanc0$epos,rightmost.closed=TRUE) + 1 #The +1 is because intervals bins start from 0. Also note how the right intervals is closed to account for end of chromsoome SNPS, which would otherwise be assigned to a non-existing interval..

	#The haps matrices are identical in dimension by construction. So this interval finding only needs to be done for one matrix.
	#I do it for both just in case for testing purposes
	lanc0_anc0_expanded <- lanc0_anc0[intervals,]
	lanc1_anc0_expanded <- lanc1_anc0[intervals,]

	lanc0_anc1_expanded <- lanc0_anc1[intervals,]
	lanc1_anc1_expanded <- lanc1_anc1[intervals,]

	save(lanc0_anc0_expanded,lanc1_anc0_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc0.anc0.R",quote=F,row.names=F)
	save(lanc0_anc1_expanded,lanc1_anc1_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc.anc1.R",quote=F,row.names=F)
	save(haps0a,haps1a,file="GTP.rfmix.chr22.msp.tsv.haps.R",quote=F,row.names=F)


	#Currently only coded for 0/1, but expansible to 3...

	#at this point I should switch the data format to matrices to speed things up!

	#N copies of risk allele for ancestry 0
	hap0_anc0 <- lanc0_anc0_expanded * haps0a
	hap1_anc0 <- lanc1_anc0_expanded * haps1a

	anc0_ncopies <- hap0_anc0 + hap1_anc0

	#N copies of risk allele for ancestry 1
	hap0_anc1 <- lanc0_anc1_expanded * haps0[,c(10:ncol(haps0))]
	hap1_anc1 <- lanc1_anc1_expanded * haps1[,c(10:ncol(haps0))]

	anc1_ncopies <- hap0_anc1 + hap1_anc1

	#And so on for N groups > 2...

	#Now have matrices where each row is a SNP and each column is a subject. Need to transpose to make this usable.

	anc0_ncopies_t <- t(anc0_ncopies)
	anc1_ncopies_t <- t(anc1_ncopies)

	#Now each row is a subject and column is a SNP. this can now be mated with the phenotype/covariate file
	p <- match_order(d2$idx, d1$idx)


	#Then just apply over all SNPs with a common formula