Skip to content

Instantly share code, notes, and snippets.

@k-hench
Created October 8, 2018 15:07
Show Gist options
  • Save k-hench/90c3fc42a349f3de41e965acebcf302e to your computer and use it in GitHub Desktop.
Save k-hench/90c3fc42a349f3de41e965acebcf302e to your computer and use it in GitHub Desktop.
small R script to create a random vcf file for testing purposes
library(tidyverse)
# allele section -----------------
# available base pairs
BS <- c('A','T','C','G')
# sample random base pair
bs <- function(n,replace = TRUE,...) sample(x = BS, size = n, replace = replace, ...)
# sample a possible alternative allele for a single given base pair
alt_bs_1 <- function(bs){
stopifnot(length(bs)==1)
sample(x = (BS[BS != bs]), size = 1, replace = TRUE)}
# vectorization of the alternative allele sampling
alt_bs <- function(bs){purrr::map(bs,alt_bs_1)%>%unlist()}
# genotype section --------------------------------
# sample a single genotype from a given allele frequency
gt_1 <- function(freq){ sample(x = c(0,1),
size = 2,
replace = TRUE,
prob = c(1-freq,freq)) %>%
str_c(.,collapse = '/')
}
# vectorization of the genotype sampling
gt <- function(freqs){purrr::map(freqs,gt_1) %>%unlist()}
# enable genotype sampling for a single individual from an allele frequency tibble
gt_ind <- function(name,AF){AF %>% mutate(GT = gt(AF)) %>% select(GT) %>% set_names(.,nm=name)}
# sample genotypes for a popullation of individuals
gt_pop <- function(pops,ns_pop,AFp){
nms <- cross(list(P=1:pops,I=1:ns_pop)) %>% bind_rows() %>% mutate(id = str_c('p',P,'i',I))
purrr::map2(.x = nms$id, .y = AFp,.f = gt_ind) %>%
bind_cols()
}
# create a reference allele frequency tibble
af_pop <- function(RG){RG %>% mutate(AF = runif(n = length(RG$POS),min = 0,max = 1))}
# create an alterantive allele frequency tibble from a reference
af_alt <- function(pop,AF){AF %>% mutate(AF_altS = AF * smooth(runif(((dim(.))[1]),min = 0,max = 1),endrule = 'copy',twiceit = FALSE),
AF_altM = (1-AF) * smooth(runif(((dim(.))[1]),min = 0,max = 1),endrule = 'copy',twiceit = FALSE),
choice = sign(rnorm(dim(.)[1])),
AF_alt = ifelse(choice > 0, AF-AF_altS, AF+AF_altM)) %>%
select(CHROM,POS,AF_alt) %>% set_names(.,nm=c('CHOM','POS','AF'))}
# genome section ---------------------------------------
# create a reference genome structure witch multiple linkage groups of varying sizes
ref_genome <- function(n,sizes){
stopifnot(length(sizes) == n)
tibble(CHROM = str_c('LG',str_pad(string = 1:n,width = 2,pad = '0')),
SIZE = sizes)}
# sample random sites within the genome (SNPs)
pos_lg <- function(n,chrom = 'LG01',RG){
pmax <- RG %>% filter(CHROM == chrom) %>% select(SIZE) %>% unlist() %>% as.vector()
tibble(CHROM = rep(chrom,n),POS = sample(x = 1:pmax,size = n, replace = FALSE)) %>%
arrange(POS)
}
# create inventory of individuals for every population
pop_inv <- function(pop,ind){str_c('p',pop,'i',1:ind) %>%
str_c(.,collapse = '\n') %>%
write_lines(x = .,path = str_c("pop",pop,".txt"))}
# vcf simulation ----------------------
vcf_sim <- function(chroms,ns,pops,ns_pop){
# create the first 9 collumns containing the SNP info
COL_START <- purrr::map2(.x = ns,.y = chroms,.f = pos_lg,RG=RG) %>%
bind_rows() %>%
arrange(CHROM,POS) %>%
mutate(
ID = '.',
REF = bs(n = sum(ns)),
ALT = alt_bs(REF),
QUAL = '.',
FILTER = 'PASS',
INFO = '.',
FORMAT = 'GT'
)
# create the reference allele frequency table
AF_ref <- COL_START %>%
mutate(AF = runif(n = length(COL_START$POS),min = 0,max = 1)) %>%
select(CHROM,POS,AF)
# create one alternative allele frequency table per population
AF_s <- (purrr::map(1:pops,af_alt,AF=AF_ref)) %>% rep(.,each = ns_pop)
# create the genotype columns
GTs <- gt_pop(pops,ns_pop,AFp=AF_s)
# combine the SNP info and the genotypes
COL_START %>%
set_names(.,nm = c("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT")) %>%
bind_cols(.,GTs)
}
# input ----------------------
# get the current date for the file header
cur_date <- lubridate::now() %>% as.character() %>% str_extract(.,"[0-9]{4}-[0-9]{2}-[0-9]{2}")
# number of LGs within the genome
n_lg <- 8
# create a reference genome
RG <- ref_genome(n=n_lg,sizes=(2+runif(n_lg,min = -.5,max = .7))*10^7)
# number of populations
npop <- 2
# number of individulas per population
nind <- 45
# create the vcf body
vcf_body <- vcf_sim(ns = sample(x = 20:100,size = n_lg),chroms = str_c('LG0',1:n_lg),npop,nind)
# the output file name
file_name <- 'simulated.vcf'
# export ----------------------
# the vcf file
write_lines(x = str_c("##fileformat=VCFv4.2\n##fileDate=",cur_date,"\n##note=random_genotypes_created_by_vcf_sim.R"),path = file_name)
write_lines(x = str_c(names(vcf_body),collapse = '\t'),path = file_name,append = TRUE)
write_delim(x = vcf_body,path = file_name,delim = '\t',append = TRUE)
system(str_c("gzip ",file_name))
# the pop files
purrr::map(1:npop,pop_inv,ind=nind)
# test the created vcf with vcftools ----
# (calculate fst)
system(str_c("vcftools --gzvcf ",file_name,".gz --weir-fst-pop pop1.txt --weir-fst-pop pop2.txt --out ",file_name))
# read results & plot ------------------
# expand the reference genome to include LG start and end positions
ref <- RG %>%
mutate(START = cumsum(lag(SIZE,default = 0)),
END=START+SIZE,
GROUP = letters[(row_number()%%2)+1])
# read the fst results
fst <- read_delim(str_c(file_name,'.weir.fst'),delim = '\t') %>%
left_join(.,ref) %>%
mutate(GPOS = START+POS)
# plotting -----------------
# background gray
pgray <- rgb(.9,.9,.9)
# plot
ggplot(fst,aes(x=GPOS,y = WEIR_AND_COCKERHAM_FST))+
geom_rect(inherit.aes = FALSE,data = ref,
aes(xmin=START,xmax=END,ymin=-Inf,ymax=Inf,fill=GROUP))+
geom_point(aes(color = CHROM),alpha = .3)+
geom_smooth(se = FALSE,span=40,aes(color = CHROM))+
scale_x_continuous(expand = c(0,0))+
scale_fill_manual(values = c(NA, pgray),guide = FALSE)+
theme(plot.background = element_blank(),
panel.background = element_blank(), panel.grid = element_blank(),
panel.border = element_blank(), axis.title.x = element_blank(),
axis.line = element_line(), strip.background = element_rect(fill = NA,
color = pgray),
legend.background = element_rect(fill = "transparent", color = NA),
legend.key = element_rect(fill = "transparent", color = NA))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment