Skip to content

Instantly share code, notes, and snippets.

@nievergeltlab
Created October 16, 2020 20:56
Show Gist options
  • Save nievergeltlab/9454b13b9be3a1d15bcf7a12fd1bddba to your computer and use it in GitHub Desktop.
Save nievergeltlab/9454b13b9be3a1d15bcf7a12fd1bddba to your computer and use it in GitHub Desktop.
Demonstrates how RFmix outputs can be expanded togenome wide, local ancestry coded to be used in analysis.
#Do a kcolumn split of local ancestry
#Do a delimiting of | then kcolumn split of haplotypes vcf
zcat gtpc_phased_chr22.vcf.gz | tail -n+11 | sed 's/|/\t/g' > gtpc_phased_chr22.vcf.haps
zcat gtpc_phased_chr22.vcf.gz | head -n10 | tail -n1 | sed 's/#//g' | awk '{$1=$1; print}' > gtpc_phased_chr22.vcf.haps.header
#Notice that the odd/even selection REMOVES the columns, rather than selecting them, hence the file names are haps1 and haps0
awk '{ for (i=10;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps1
awk '{ for (i=11;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps0
#In R, the easiest thing to do will be to append the trailing .0 and .1 on subject names to account for the haps names.
#note I use sed to change 'n snps' coluimn name to get rid of the dumb space delimiter
awk '{ for (i=7;i<=NF;i+=2) $i="" } 1' GTP.rfmix.chr22.msp.tsv | sed 's/n snps/nsnps/g' | sed 's/#//g' | tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc1
awk '{ for (i=8;i<=NF;i+=2) $i="" } 1' GTP.rfmix.chr22.msp.tsv | sed 's/n snps/nsnps/g' | sed 's/#//g' | tail -n+1 > GTP.rfmix.chr22.msp.tsv.lanc0
#-f$(seq -s, 2 2 10)
R
library(data.table)
library(wrapr)
library(speedglm)
#Load covariate data sheets
fam <- fread('gtpc_bg_cogb.fam',data.table=F)
names(fam) <- c("FID","IID","M","F","G","P")
covariate <- fread('pts_gtpc_mix_am-qc-aam_pca.menv.mds_cov',data.table=F)
covs <- merge(fam,covariate,by=c("FID","IID"))
#Load haps
haps0 <- fread('gtpc_phased_chr22.vcf.haps0',data.table=F)
haps1 <- fread('gtpc_phased_chr22.vcf.haps1',data.table=F)
#append the .0 to the hap names to match the lanc files
names(haps0)[-c(1:9)] <- paste(names(haps0)[-c(1:9)],".0",sep='')
names(haps1)[-c(1:9)] <- paste(names(haps1)[-c(1:9)],".1",sep='')
row.names(haps0) <- haps0$ID
row.names(haps1) <- haps1$ID
#make this just a matrix for processing speed!
haps0a <- as.matrix(haps0[,-c(1:9)])
haps1a <- as.matrix(haps1[,-c(1:9)])
#can't remove unaltered haps matrix yet to save ram because the position is utilized in intervals. save as a variable to allow for memory saving
#Load LANC
lanc0 <- fread('GTP.rfmix.chr22.msp.tsv.lanc0',data.table=F)
lanc1 <- fread('GTP.rfmix.chr22.msp.tsv.lanc1',data.table=F)
#Convert local ancestry to sign values for calculating number of effect alleles. For greater than 2 ancestries, just add more equivalences (e.g. ..._anc2 == 2, ..._anc3 ==3 and so on)
eqcheck <- function(x,eqval=0)
{
as.numeric(x == eqval)
}
lanc0_anc0 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=0))
lanc1_anc0 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=0))
lanc0_anc1 <- as.matrix(apply(lanc0[,c(7:ncol(lanc0))],c(1,2),eqcheck,eqval=1))
lanc1_anc1 <- as.matrix(apply(lanc1[,c(7:ncol(lanc1))],c(1,2),eqcheck,eqval=1))
#
#Expand the local ancestry matrices so they conform to dimension of haplotypes (i.e. assign local ancestry to every SNP)
intervals <- findInterval(haps0$POS, lanc0$epos,rightmost.closed=TRUE) + 1 #The +1 is because intervals bins start from 0. Also note how the right intervals is closed to account for end of chromsoome SNPS, which would otherwise be assigned to a non-existing interval..
#The haps matrices are identical in dimension by construction. So this interval finding only needs to be done for one matrix.
#I do it for both just in case for testing purposes
lanc0_anc0_expanded <- lanc0_anc0[intervals,]
lanc1_anc0_expanded <- lanc1_anc0[intervals,]
lanc0_anc1_expanded <- lanc0_anc1[intervals,]
lanc1_anc1_expanded <- lanc1_anc1[intervals,]
save(lanc0_anc0_expanded,lanc1_anc0_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc0.anc0.R",quote=F,row.names=F)
save(lanc0_anc1_expanded,lanc1_anc1_expanded,file="GTP.rfmix.chr22.msp.tsv.lanc.anc1.R",quote=F,row.names=F)
save(haps0a,haps1a,file="GTP.rfmix.chr22.msp.tsv.haps.R",quote=F,row.names=F)
#Currently only coded for 0/1, but expansible to 3...
#at this point I should switch the data format to matrices to speed things up!
#N copies of risk allele for ancestry 0
hap0_anc0 <- lanc0_anc0_expanded * haps0a
hap1_anc0 <- lanc1_anc0_expanded * haps1a
anc0_ncopies <- hap0_anc0 + hap1_anc0
#N copies of risk allele for ancestry 1
hap0_anc1 <- lanc0_anc1_expanded * haps0[,c(10:ncol(haps0))]
hap1_anc1 <- lanc1_anc1_expanded * haps1[,c(10:ncol(haps0))]
anc1_ncopies <- hap0_anc1 + hap1_anc1
#And so on for N groups > 2...
#Now have matrices where each row is a SNP and each column is a subject. Need to transpose to make this usable.
anc0_ncopies_t <- t(anc0_ncopies)
anc1_ncopies_t <- t(anc1_ncopies)
#Now each row is a subject and column is a SNP. this can now be mated with the phenotype/covariate file
p <- match_order(d2$idx, d1$idx)
#Then just apply over all SNPs with a common formula
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment