Created
October 8, 2018 15:07
-
-
Save k-hench/90c3fc42a349f3de41e965acebcf302e to your computer and use it in GitHub Desktop.
small R script to create a random vcf file for testing purposes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# allele section ----------------- | |
# available base pairs | |
BS <- c('A','T','C','G') | |
# sample random base pair | |
bs <- function(n,replace = TRUE,...) sample(x = BS, size = n, replace = replace, ...) | |
# sample a possible alternative allele for a single given base pair | |
alt_bs_1 <- function(bs){ | |
stopifnot(length(bs)==1) | |
sample(x = (BS[BS != bs]), size = 1, replace = TRUE)} | |
# vectorization of the alternative allele sampling | |
alt_bs <- function(bs){purrr::map(bs,alt_bs_1)%>%unlist()} | |
# genotype section -------------------------------- | |
# sample a single genotype from a given allele frequency | |
gt_1 <- function(freq){ sample(x = c(0,1), | |
size = 2, | |
replace = TRUE, | |
prob = c(1-freq,freq)) %>% | |
str_c(.,collapse = '/') | |
} | |
# vectorization of the genotype sampling | |
gt <- function(freqs){purrr::map(freqs,gt_1) %>%unlist()} | |
# enable genotype sampling for a single individual from an allele frequency tibble | |
gt_ind <- function(name,AF){AF %>% mutate(GT = gt(AF)) %>% select(GT) %>% set_names(.,nm=name)} | |
# sample genotypes for a popullation of individuals | |
gt_pop <- function(pops,ns_pop,AFp){ | |
nms <- cross(list(P=1:pops,I=1:ns_pop)) %>% bind_rows() %>% mutate(id = str_c('p',P,'i',I)) | |
purrr::map2(.x = nms$id, .y = AFp,.f = gt_ind) %>% | |
bind_cols() | |
} | |
# create a reference allele frequency tibble | |
af_pop <- function(RG){RG %>% mutate(AF = runif(n = length(RG$POS),min = 0,max = 1))} | |
# create an alterantive allele frequency tibble from a reference | |
af_alt <- function(pop,AF){AF %>% mutate(AF_altS = AF * smooth(runif(((dim(.))[1]),min = 0,max = 1),endrule = 'copy',twiceit = FALSE), | |
AF_altM = (1-AF) * smooth(runif(((dim(.))[1]),min = 0,max = 1),endrule = 'copy',twiceit = FALSE), | |
choice = sign(rnorm(dim(.)[1])), | |
AF_alt = ifelse(choice > 0, AF-AF_altS, AF+AF_altM)) %>% | |
select(CHROM,POS,AF_alt) %>% set_names(.,nm=c('CHOM','POS','AF'))} | |
# genome section --------------------------------------- | |
# create a reference genome structure witch multiple linkage groups of varying sizes | |
ref_genome <- function(n,sizes){ | |
stopifnot(length(sizes) == n) | |
tibble(CHROM = str_c('LG',str_pad(string = 1:n,width = 2,pad = '0')), | |
SIZE = sizes)} | |
# sample random sites within the genome (SNPs) | |
pos_lg <- function(n,chrom = 'LG01',RG){ | |
pmax <- RG %>% filter(CHROM == chrom) %>% select(SIZE) %>% unlist() %>% as.vector() | |
tibble(CHROM = rep(chrom,n),POS = sample(x = 1:pmax,size = n, replace = FALSE)) %>% | |
arrange(POS) | |
} | |
# create inventory of individuals for every population | |
pop_inv <- function(pop,ind){str_c('p',pop,'i',1:ind) %>% | |
str_c(.,collapse = '\n') %>% | |
write_lines(x = .,path = str_c("pop",pop,".txt"))} | |
# vcf simulation ---------------------- | |
vcf_sim <- function(chroms,ns,pops,ns_pop){ | |
# create the first 9 collumns containing the SNP info | |
COL_START <- purrr::map2(.x = ns,.y = chroms,.f = pos_lg,RG=RG) %>% | |
bind_rows() %>% | |
arrange(CHROM,POS) %>% | |
mutate( | |
ID = '.', | |
REF = bs(n = sum(ns)), | |
ALT = alt_bs(REF), | |
QUAL = '.', | |
FILTER = 'PASS', | |
INFO = '.', | |
FORMAT = 'GT' | |
) | |
# create the reference allele frequency table | |
AF_ref <- COL_START %>% | |
mutate(AF = runif(n = length(COL_START$POS),min = 0,max = 1)) %>% | |
select(CHROM,POS,AF) | |
# create one alternative allele frequency table per population | |
AF_s <- (purrr::map(1:pops,af_alt,AF=AF_ref)) %>% rep(.,each = ns_pop) | |
# create the genotype columns | |
GTs <- gt_pop(pops,ns_pop,AFp=AF_s) | |
# combine the SNP info and the genotypes | |
COL_START %>% | |
set_names(.,nm = c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT")) %>% | |
bind_cols(.,GTs) | |
} | |
# input ---------------------- | |
# get the current date for the file header | |
cur_date <- lubridate::now() %>% as.character() %>% str_extract(.,"[0-9]{4}-[0-9]{2}-[0-9]{2}") | |
# number of LGs within the genome | |
n_lg <- 8 | |
# create a reference genome | |
RG <- ref_genome(n=n_lg,sizes=(2+runif(n_lg,min = -.5,max = .7))*10^7) | |
# number of populations | |
npop <- 2 | |
# number of individulas per population | |
nind <- 45 | |
# create the vcf body | |
vcf_body <- vcf_sim(ns = sample(x = 20:100,size = n_lg),chroms = str_c('LG0',1:n_lg),npop,nind) | |
# the output file name | |
file_name <- 'simulated.vcf' | |
# export ---------------------- | |
# the vcf file | |
write_lines(x = str_c("##fileformat=VCFv4.2\n##fileDate=",cur_date,"\n##note=random_genotypes_created_by_vcf_sim.R"),path = file_name) | |
write_lines(x = str_c(names(vcf_body),collapse = '\t'),path = file_name,append = TRUE) | |
write_delim(x = vcf_body,path = file_name,delim = '\t',append = TRUE) | |
system(str_c("gzip ",file_name)) | |
# the pop files | |
purrr::map(1:npop,pop_inv,ind=nind) | |
# test the created vcf with vcftools ---- | |
# (calculate fst) | |
system(str_c("vcftools --gzvcf ",file_name,".gz --weir-fst-pop pop1.txt --weir-fst-pop pop2.txt --out ",file_name)) | |
# read results & plot ------------------ | |
# expand the reference genome to include LG start and end positions | |
ref <- RG %>% | |
mutate(START = cumsum(lag(SIZE,default = 0)), | |
END=START+SIZE, | |
GROUP = letters[(row_number()%%2)+1]) | |
# read the fst results | |
fst <- read_delim(str_c(file_name,'.weir.fst'),delim = '\t') %>% | |
left_join(.,ref) %>% | |
mutate(GPOS = START+POS) | |
# plotting ----------------- | |
# background gray | |
pgray <- rgb(.9,.9,.9) | |
# plot | |
ggplot(fst,aes(x=GPOS,y = WEIR_AND_COCKERHAM_FST))+ | |
geom_rect(inherit.aes = FALSE,data = ref, | |
aes(xmin=START,xmax=END,ymin=-Inf,ymax=Inf,fill=GROUP))+ | |
geom_point(aes(color = CHROM),alpha = .3)+ | |
geom_smooth(se = FALSE,span=40,aes(color = CHROM))+ | |
scale_x_continuous(expand = c(0,0))+ | |
scale_fill_manual(values = c(NA, pgray),guide = FALSE)+ | |
theme(plot.background = element_blank(), | |
panel.background = element_blank(), panel.grid = element_blank(), | |
panel.border = element_blank(), axis.title.x = element_blank(), | |
axis.line = element_line(), strip.background = element_rect(fill = NA, | |
color = pgray), | |
legend.background = element_rect(fill = "transparent", color = NA), | |
legend.key = element_rect(fill = "transparent", color = NA)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment