-
-
Save eacooper400/c954c757db02eb6c4ccfae1aa090658c to your computer and use it in GitHub Desktop.
R Functions for GEN 8900 Comp. Genomics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Calculate Allele Frequencies from Genotype Counts | |
allele.freq <- function(genotypeCounts) { | |
n = sum(genotypeCounts) - genotypeCounts["NN"] | |
p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n) | |
q = 1-p | |
freqs = c(p,q) | |
names(freqs) = c("p", "q") | |
return(freqs) | |
} | |
### Calculate the LD parameter R-squared from 2 rows of a VCF file | |
calc_r2 <- function(row1, row2) { | |
g1 = get.field(row1[10:length(row1)], row1[9], "GT") | |
g2 = get.field(row2[10:length(row2)], row2[9], "GT") | |
pA = unname(allele.freq(count.genotypes(g1))["p"]) | |
pB = unname(allele.freq(count.genotypes(g2))["p"]) | |
h = get.haplotypes(g1, g2) | |
pAB = (length(h[h=="00"]))/(length(h)) | |
D = pAB - (pA * pB) | |
rsq = (D**2)/(pA*(1-pA)*pB*(1-pB)) | |
return(rsq) | |
} | |
### Count the AA, Aa, and aa genotypes in a sample | |
count.genotypes <- function(genotypes) { | |
genotypes = gsub("(\\||/)", "", genotypes) | |
gen.patterns = c("00", "01", "10", "11", "..") | |
my.counts=table(factor(genotypes, levels=gen.patterns)) | |
final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5]) | |
names(final.counts) = c("AA", "Aa", "aa", "NN") | |
return(final.counts) | |
} | |
### Count the number of derived alleles for a VCF row | |
derivedCount <- function(row) { | |
row=as.vector(row, mode="character") | |
x=count.genotypes(get.field(row[10:length(row)], row[9], "GT")) | |
dc=(2*x["aa"])+x["Aa"] | |
return(unname(dc)) | |
} | |
### Calculate Nucleotide Divergence (Dxy) | |
dxy <- function(vcf1, vcf2, perBP=TRUE) { | |
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) | |
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) | |
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"]) | |
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"]) | |
## Let x be the allele frequency (p) in pop1 * (1-p) in Pop2 | |
x = af1 * (1-af2) | |
## Let y be the allele frequency (p) in pop2 * (1-p) in Pop1 | |
y = af2 * (1-af1) | |
dxy=sum((x+y)) | |
if (perBP) { | |
c = unique(vcf1$CHROM) | |
s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf1) | |
e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf1) | |
bp=sum(e-s) | |
return(dxy/bp) | |
} else { | |
return(dxy) | |
} | |
} | |
### Calculate Expected Het. and return, p, n, and H | |
expected.het <- function(genotypes) { | |
obs.counts = count.genotypes(genotypes) | |
n = sum(obs.counts) - obs.counts["NN"] | |
freqs = allele.freq(obs.counts) | |
Hexp = 2 * freqs[1] * freqs[2] | |
res = c(freqs["p"], n, Hexp) | |
res=as.numeric(unname(res)) | |
return(res) | |
} | |
### Determine which SNPs are Polymorphic vs Fixed in 2 species | |
fixed.poly <- function(vcf1, vcf2) { | |
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) | |
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) | |
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"]) | |
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"]) | |
res = rep("Polymorphic", nrow(g1)) | |
res[which(abs(af1-af2)==1)] = "Fixed" | |
return(res) | |
} | |
### Get a Specified Field From a VCF Sample/Genotype String | |
get.field <- function(samples, format, fieldName) { | |
x=strsplit(samples, split=":") | |
fields=unlist(strsplit(format, split=":")) | |
i=which(fields==fieldName) | |
if (!(fieldName %in% fields)) stop('fieldName not found in format fields') | |
return(sapply(x, `[[`, i)) | |
} | |
### Get the HAPLOTYPES for a pair of genotype strings | |
get.haplotypes <- function(genotypes1, genotypes2) { | |
a1 = gsub("\\|", "", genotypes1) | |
a2 = gsub("\\|", "", genotypes2) | |
a1=unlist(strsplit(paste0(a1, collapse=""), split="")) | |
a2=unlist(strsplit(paste0(a2, collapse=""), split="")) | |
haps = paste0(a1,a2) | |
return(haps) | |
} | |
### Calculate minor allele frequency | |
maf <- function(vcf.row) { | |
temp=as.vector(vcf.row, mode="character") | |
af=allele.freq(count.genotypes(get.field(temp[10:length(temp)], temp[9], "GT"))) | |
maf=min(unname(af)) | |
return(maf) | |
} | |
### Calculate Nucleotide Diversity (Pi) | |
pi.diversity <- function(vcf, perBP=TRUE) { | |
J=apply(vcf, 1, derivedCount) | |
N=apply(vcf, 1, function(x) sum(count.genotypes(get.field(x[10:length(x)], x[9], "GT")))) | |
C=2*N | |
pi = sum((2*J*(C-J))/(C*(C-1))) | |
if (perBP) { | |
c = unique(vcf$CHROM) | |
s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf) | |
e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf) | |
bp=sum(e-s) | |
return(pi/bp) | |
} else { return(pi) } | |
} | |
### Read in a VCF file as a table | |
read.vcf <- function(file, special.char="##", ...) { | |
my.search.term=paste0(special.char, ".*") | |
all.lines=readLines(file) | |
clean.lines=gsub(my.search.term, "", all.lines) | |
clean.lines=gsub("#CHROM", "CHROM", clean.lines) | |
read.table(..., text=paste(clean.lines, collapse="\n")) | |
} | |
### Calculate the variance for Tajima's D | |
variance.d <- function(n,S) { | |
a1=sum(1/(seq(from=1, to=(n-1), by=1))) | |
a2=sum(1/((seq(from=1, to=(n-1), by=1))**2)) | |
b1=(n+1)/(3*(n-1)) | |
b2=(2*((n**2)+n+3))/((9*n)*(n-1)) | |
c1=b1 - (1/a1) | |
c2=b2-((n+2)/(a1*n)) + (a2/(a1**2)) | |
e1=c1/a1 | |
e2=c2/((a1**2)+a2) | |
var=(e1*S) + (e2*S*(S-1)) | |
return(var) | |
} | |
### Calculate Waterson's theta | |
waterson.theta <- function(data, perBP=TRUE) { | |
num.bp=nrow(data) | |
check=apply(data, 1, FUN=maf) | |
filter=data[which(check>0),] | |
Sn=nrow(filter) | |
n.ind=ncol(filter)-9 | |
i=seq(1, ((2*n.ind)-1)) | |
theta=Sn/sum(1/i) | |
if (perBP) { return(theta/num.bp) } | |
else { return(theta) } | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Function Descriptions and Examples
allele.freq
Description
Returns a vector of 2 allele frequencies, p and q, based on a
starting vector of genotype counts (such as the named vector
returned by count.genotypes().
Usage
Arguments
Examples
Set up a hypothetical vector of genotype counts, where there are 30 AA genotypes,
22 Aa, 15 aa, and 5 missing:
Run
allele.freq
to calculate the frequency of the 'A' allele (p) andthe 'a' allele (q):
calc_r2
Description
Calculates and returns a single numeric value for r-squared, based on 2 input rows in VCF format (which are interpreted by the function as containing phased genotypes from two separate SNP positions).
Usage
Arguments
2 character vectors corresponding to rows from a VCF file.
Examples
Given the following VCF file:
Use the function on the first 2 rows:
count.genotypes
Description
Counts the number of 'AA', 'Aa', 'aa', and missing genotypes in a string of
genotypes in VCF format (0/0, 0/1, 1/1, or 0|0, 0|1, 1|0, 1|1). Returns a named vector of
4 values: AA, Aa, aa, NN.
Usage
Arguments
Examples
Create a hypothetical genotype vector with 3 copies of each genotype:
Now, run count.genotypes to get the counts:
derivedCount
Description
Takes as input a single row from a VCF formatted file, and outputs the count of derived (ALT) alleles at that row.
Usage
Arguments
Requires a single vector in the format of a row from a VCF file.
Examples
Given a VCF row like this:
Used derivedCount like this:
dxy
Description
Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and calculates a single value for nucleotide divergence (Dxy) between the 2 subsets.
Usage
Arguments
vcf1
andvcf2
must each be a data frame with VCF formatted data. Both data frames must have the same number of rows. By default,dxy
will be calculated as a per base pair rate, unless the argumentperBP=FALSE
is provided. The function assumes no missing data between the first and last position when calculating the number of base pairs.Examples
Given the data set below:
Split Individuals 1-5 and 6-10 into 2 groups data frames, keeping the first 9 columns in each:
And run the
dxy
function to calculate nucleotide divergence between them:expected.het
Description
Takes as input a vector of sample genotypes (as returned by get.field) and returns a vector of 3 values: p (reference allele frequency), n (sample size), and Hexp (expected heterozygosity).
Usage
Arguments
Examples
Create a hypothetical genotype vector with 3 copies of each genotype:
Now, run expected.het to get p, N, and Hexp:
fixed.poly
Description
Classifies each SNP in a set of data as either "Fixed" or "Polymorphic" between 2 species/populations. Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and returns a vector with one of 2 possible values for each row in the vcf: "Fixed" or "Polymorphic."
Usage
Arguments
Requires 2 data frames with VCF formatted data. The data frames MUST have the same number of rows, but do not need to have the same number of columns. fixed.poly will compare the allele frequencies between the two data frames to classify SNPs as shared or polymorphic.
Examples
Using the Polar Bear data set from Week 9:
get.field
Description
Parses the information from a vector of sample genotype strings in VCF
format (e.g. "0/1:2,15:17:5,0,36") and returns a vector with only a
single specifed field for each sample.
Usage
Arguments
Examples
Starting with a single row from an example VCF file that looks like this:
We can extract the genotype (GT) field from each of the 6 samples like this:
Similarly, we can get the Allele Depth (AD) field:
Or any of the other 3 fields present in this file:
get.haplotypes
Description
Takes as input two genotype strings (from two different rows of a VCF file), such as those generated by the get.field() function. Returns a single string of individual haplotypes in the form "00", "01", "10", or "11".
Usage
Arguments
Examples
Given the 2 genotype strings below:
use the function to get the single haplotype string:
maf
Description
Returns a calculation of the Minor Allele Frequency for a single row (SNP position) in a VCF format file. Assumes bi-allelic loci.
Usage
Arguments
A single vector corresponding to 1 row of a VCF format file.
Examples
If we create an example row in VCF format like this:
We run the
maf()
function on it like this:pi.diversity
Description
Takes as input a data frame in VCF format (such as created by
read.vcf()
) and calculates a single estimate of the Nucleotide Diversity (Pi). By default, this will calculate Pi per base pair, unlessperBP
is set toFALSE
.Usage
Arguments
A single VCF format data frame (either the original file read in by
read.vcf
or anysubset()
of this file). An optional setting ofperBP=FALSE
to return the raw value of Pi for the whole region. Since this function assumes no missing data between the positions of the first and last SNP, you may want to return the raw value and estimate the per base rate on your own if you believe this assumption is not met in your data.Example
Given the data frame below:
Use the function to calculate Pi:
read.vcf
Description
Reads a VCF file into R as a tab-delimited table. Takes as input the
name of a VCF file, and returns an object in the form of a data frame.
Usage
Arguments
Examples
If we have a VCF file called "sampleData.vcf" we can read it into R
like this:
variance.d
Description
Calculates the variance of d (pi - waterson's theta) for the denominator of the Tajima's D equation.
Usage
Arguments
Examples
Given a gene with sequences for 30 individuals, and 100 SNP sites:
waterson.theta
Description
Calculates a single estimate of Waterson's theta for an entire VCF formatted data table (such as one created by using
read.vcf()
. Can also run on anysubset()
of this table. Will automatically check which sites are variant (using themaf()
function).Usage
Arguments
Takes a single data.frame variable as input, along with a TRUE/FALSE argument for whether or not to calculate the per base pair rate of theta. By default this function assumes that perBP=TRUE. The data.frame must have VCF formatted data for the function to work correctly.
Examples
Given the table below:
Run the
waterson.theta
function: