Skip to content

Instantly share code, notes, and snippets.

@eacooper400
Last active January 27, 2024 01:52
Show Gist options
  • Save eacooper400/c954c757db02eb6c4ccfae1aa090658c to your computer and use it in GitHub Desktop.
Save eacooper400/c954c757db02eb6c4ccfae1aa090658c to your computer and use it in GitHub Desktop.
R Functions for GEN 8900 Comp. Genomics
### Calculate Allele Frequencies from Genotype Counts
allele.freq <- function(genotypeCounts) {
n = sum(genotypeCounts) - genotypeCounts["NN"]
p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n)
q = 1-p
freqs = c(p,q)
names(freqs) = c("p", "q")
return(freqs)
}
### Calculate the LD parameter R-squared from 2 rows of a VCF file
calc_r2 <- function(row1, row2) {
g1 = get.field(row1[10:length(row1)], row1[9], "GT")
g2 = get.field(row2[10:length(row2)], row2[9], "GT")
pA = unname(allele.freq(count.genotypes(g1))["p"])
pB = unname(allele.freq(count.genotypes(g2))["p"])
h = get.haplotypes(g1, g2)
pAB = (length(h[h=="00"]))/(length(h))
D = pAB - (pA * pB)
rsq = (D**2)/(pA*(1-pA)*pB*(1-pB))
return(rsq)
}
### Count the AA, Aa, and aa genotypes in a sample
count.genotypes <- function(genotypes) {
genotypes = gsub("(\\||/)", "", genotypes)
gen.patterns = c("00", "01", "10", "11", "..")
my.counts=table(factor(genotypes, levels=gen.patterns))
final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5])
names(final.counts) = c("AA", "Aa", "aa", "NN")
return(final.counts)
}
### Count the number of derived alleles for a VCF row
derivedCount <- function(row) {
row=as.vector(row, mode="character")
x=count.genotypes(get.field(row[10:length(row)], row[9], "GT"))
dc=(2*x["aa"])+x["Aa"]
return(unname(dc))
}
### Calculate Nucleotide Divergence (Dxy)
dxy <- function(vcf1, vcf2, perBP=TRUE) {
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
## Let x be the allele frequency (p) in pop1 * (1-p) in Pop2
x = af1 * (1-af2)
## Let y be the allele frequency (p) in pop2 * (1-p) in Pop1
y = af2 * (1-af1)
dxy=sum((x+y))
if (perBP) {
c = unique(vcf1$CHROM)
s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf1)
e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf1)
bp=sum(e-s)
return(dxy/bp)
} else {
return(dxy)
}
}
### Calculate Expected Het. and return, p, n, and H
expected.het <- function(genotypes) {
obs.counts = count.genotypes(genotypes)
n = sum(obs.counts) - obs.counts["NN"]
freqs = allele.freq(obs.counts)
Hexp = 2 * freqs[1] * freqs[2]
res = c(freqs["p"], n, Hexp)
res=as.numeric(unname(res))
return(res)
}
### Determine which SNPs are Polymorphic vs Fixed in 2 species
fixed.poly <- function(vcf1, vcf2) {
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
res = rep("Polymorphic", nrow(g1))
res[which(abs(af1-af2)==1)] = "Fixed"
return(res)
}
### Get a Specified Field From a VCF Sample/Genotype String
get.field <- function(samples, format, fieldName) {
x=strsplit(samples, split=":")
fields=unlist(strsplit(format, split=":"))
i=which(fields==fieldName)
if (!(fieldName %in% fields)) stop('fieldName not found in format fields')
return(sapply(x, `[[`, i))
}
### Get the HAPLOTYPES for a pair of genotype strings
get.haplotypes <- function(genotypes1, genotypes2) {
a1 = gsub("\\|", "", genotypes1)
a2 = gsub("\\|", "", genotypes2)
a1=unlist(strsplit(paste0(a1, collapse=""), split=""))
a2=unlist(strsplit(paste0(a2, collapse=""), split=""))
haps = paste0(a1,a2)
return(haps)
}
### Calculate minor allele frequency
maf <- function(vcf.row) {
temp=as.vector(vcf.row, mode="character")
af=allele.freq(count.genotypes(get.field(temp[10:length(temp)], temp[9], "GT")))
maf=min(unname(af))
return(maf)
}
### Calculate Nucleotide Diversity (Pi)
pi.diversity <- function(vcf, perBP=TRUE) {
J=apply(vcf, 1, derivedCount)
N=apply(vcf, 1, function(x) sum(count.genotypes(get.field(x[10:length(x)], x[9], "GT"))))
C=2*N
pi = sum((2*J*(C-J))/(C*(C-1)))
if (perBP) {
c = unique(vcf$CHROM)
s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf)
e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf)
bp=sum(e-s)
return(pi/bp)
} else { return(pi) }
}
### Read in a VCF file as a table
read.vcf <- function(file, special.char="##", ...) {
my.search.term=paste0(special.char, ".*")
all.lines=readLines(file)
clean.lines=gsub(my.search.term, "", all.lines)
clean.lines=gsub("#CHROM", "CHROM", clean.lines)
read.table(..., text=paste(clean.lines, collapse="\n"))
}
### Calculate the variance for Tajima's D
variance.d <- function(n,S) {
a1=sum(1/(seq(from=1, to=(n-1), by=1)))
a2=sum(1/((seq(from=1, to=(n-1), by=1))**2))
b1=(n+1)/(3*(n-1))
b2=(2*((n**2)+n+3))/((9*n)*(n-1))
c1=b1 - (1/a1)
c2=b2-((n+2)/(a1*n)) + (a2/(a1**2))
e1=c1/a1
e2=c2/((a1**2)+a2)
var=(e1*S) + (e2*S*(S-1))
return(var)
}
### Calculate Waterson's theta
waterson.theta <- function(data, perBP=TRUE) {
num.bp=nrow(data)
check=apply(data, 1, FUN=maf)
filter=data[which(check>0),]
Sn=nrow(filter)
n.ind=ncol(filter)-9
i=seq(1, ((2*n.ind)-1))
theta=Sn/sum(1/i)
if (perBP) { return(theta/num.bp) }
else { return(theta) }
}
@eacooper400
Copy link
Author

eacooper400 commented Sep 6, 2017

Function Descriptions and Examples

allele.freq

Description

Returns a vector of 2 allele frequencies, p and q, based on a
starting vector of genotype counts (such as the named vector
returned by count.genotypes().

Usage
allele.freq <- function(genotypeCounts) {
    n = sum(genotypeCounts) - genotypeCounts["NN"]
    p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n)
    q = 1-p
    return(c(p,q))
}
Arguments
genotypeCounts
A named vector of of genotype counts, such as the one returned by count.genotypes(). Vector names should be c("AA", "Aa", "aa", "NN").
Examples

Set up a hypothetical vector of genotype counts, where there are 30 AA genotypes,
22 Aa, 15 aa, and 5 missing:

my.counts = c(30, 22, 15, 5)
names(my.counts) = c("AA", "Aa", "aa", "NN")

my.counts
AA Aa aa NN 
30 22 15  5 

Run allele.freq to calculate the frequency of the 'A' allele (p) and
the 'a' allele (q):

allele.freq(my.counts)
        p         q 
0.6119403 0.3880597 

calc_r2

Description

Calculates and returns a single numeric value for r-squared, based on 2 input rows in VCF format (which are interpreted by the function as containing phased genotypes from two separate SNP positions).

Usage
calc_r2 = function(row1, row2) {
	g1 = get.field(row1[10:length(row1)], row1[9], "GT")
	g2 = get.field(row2[10:length(row2)], row2[9], "GT")
	pA = unname(allele.freq(count.genotypes(g1))["p"])
	pB = unname(allele.freq(count.genotypes(g2))["p"])
	h = get.haplotypes(g1, g2)
	pAB = (length(h[h=="00"]))/(length(h))
	D = pAB - (pA * pB)
	rsq = (D**2)/(pA*(1-pA)*pB*(1-pB))
	return(rsq)
}
Arguments

2 character vectors corresponding to rows from a VCF file.

Examples

Given the following VCF file:

my.data
  CHROM      POS ID REF ALT QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6
2     1 16251945  .   T   C    .   PASS    .     GT  0|1  1|0  0|1  0|1  0|1  1|1
3     1 16252180  .   G   A    .   PASS    .     GT  1|0  0|1  1|0  1|0  1|0  0|1
4     1 16254123  .   A   C    .   PASS    .     GT  1|0  1|1  1|0  1|0  1|0  0|1
5     1 16254567  .   A   G    .   PASS    .     GT  1|1  0|0  1|1  1|0  1|1  1|1
6     1 16255047  .   A   G    .   PASS    .     GT  1|1  1|0  1|1  1|1  1|1  1|1

Use the function on the first 2 rows:

calc_r2(row1=as.vector(my.data[1,], mode="character"), row2=as.vector(my.data[2,], mode="character"))
[1] 0.7142857

count.genotypes

Description

Counts the number of 'AA', 'Aa', 'aa', and missing genotypes in a string of
genotypes in VCF format (0/0, 0/1, 1/1, or 0|0, 0|1, 1|0, 1|1). Returns a named vector of
4 values: AA, Aa, aa, NN.

Usage
count.genotypes <- function(genotypes) {
    genotypes = gsub("(\\||/)", "", genotypes) 
    gen.patterns = c("00", "01", "10", "11", "..") 
    my.counts=table(factor(genotypes, levels=gen.patterns)) 
    final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5]) 
    names(final.counts) = c("AA", "Aa", "aa", "NN") 
    return(final.counts)
}
Arguments
genotypes
A vector of sample strings in the format 0/0 or 0|1, such as the vector returned by get.field when fieldName="GT".
Examples

Create a hypothetical genotype vector with 3 copies of each genotype:

my.genotypes = rep(c("0/0", "0/1", "1/1"), 3)
my.genotypes
[1] "0/0" "0/1" "1/1" "0/0" "0/1" "1/1" "0/0" "0/1" "1/1"

Now, run count.genotypes to get the counts:

count.genotypes(my.genotypes)
AA Aa aa NN 
 3  3  3  0 

derivedCount

Description

Takes as input a single row from a VCF formatted file, and outputs the count of derived (ALT) alleles at that row.

Usage
derivedCount <- function(row) {
    row=as.vector(row, mode="character")
    x=count.genotypes(get.field(row[10:length(row)], row[9], "GT"))
    dc=(2*x["aa"])+x["Aa"]
    return(dc)
}
Arguments

Requires a single vector in the format of a row from a VCF file.

Examples

Given a VCF row like this:

my.row=c("1", "68020018", " .", "C",  "G", "1490.64"," ."," ." , "GT:AD:DP:GQ:PL","1/1:12,0:12:36:0,36,478", "0/0:6,0:6:18:0,18,178", "0/0:7,0:7:21:0,21,278", "0/0:4,0:4:9:0,9,135", "0/0:10,0:10:30:0,30,380", "0/1:11,0:11:33:0,33,426")

Used derivedCount like this:

derivedCount(row=my.row)
aa 
 3 

dxy

Description

Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and calculates a single value for nucleotide divergence (Dxy) between the 2 subsets.

Usage
dxy <- function(vcf1, vcf2, perBP=TRUE) {
    g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
    g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
    af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
    af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
    ## Let x be the allele frequency (p) in pop1 * (1-p) in Pop2
    x = af1 * (1-af2)
    ## Let y be the allele frequency (p) in pop2 * (1-p) in Pop1
    y = af2 * (1-af1)
    dxy=sum((x+y))
    if (perBP) {
        c = unique(vcf1$CHROM)
        s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf1)
        e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf1)
        bp=sum(e-s)
        return(dxy/bp)
    } else { return(dxy) }
}
Arguments

vcf1 and vcf2 must each be a data frame with VCF formatted data. Both data frames must have the same number of rows. By default, dxy will be calculated as a per base pair rate, unless the argument perBP=FALSE is provided. The function assumes no missing data between the first and last position when calculating the number of base pairs.

Examples

Given the data set below:

test.data
  CHROM     POS        ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6 Ind7 Ind8 Ind9 Ind10
1     1            6051277  .   A     T        204.6       .       .           GT        ./.    1/1    ./.      ./.     ./.     0/0  0/0    ./.    0/0     ./.
2     1           6051407  .   A     G      1009.66     .       .           GT       1/1    1/1   1/1     0/0   0/0   0/0  0/0   0/0   0/0   0/0
3     1           6051488  .   C     A       699.84     .       .           GT       0/0   0/1   1/1     0/0   0/0   0/0  0/0   0/0   0/0   0/0
4     1           6051602  .   C     T        29.12       .       .           GT       0/0   0/0  0/0     0/0   0/0   0/0  0/0   0/0   0/1   0/0
5     1           6051833  .   T     C       178.88      .       .           GT       1/1    0/0  0/0     0/0   0/0   0/0  0/0   0/0   0/0   0/0

Split Individuals 1-5 and 6-10 into 2 groups data frames, keeping the first 9 columns in each:

group1=test.data[,-c(15:19)]
print(group1)
  CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5
1     1            6051277  .   A   T   204.6      .    .     GT    ./.  1/1    ./.    ./.    ./.
2     1            6051407  .   A   G 1009.66      .    .     GT  1/1  1/1  1/1  0/0  0/0
3     1            6051488  .   C   A  699.84      .    .     GT  0/0  0/1  1/1  0/0  0/0
4     1            6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/0  0/0
5     1             6051833  .   T   C  178.88      .    .     GT  1/1  0/0  0/0  0/0  0/0

group2=test.data[,-c(10:14)]
group2
  CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind6 Ind7 Ind8 Ind9 Ind10
1     1 6051277  .   A   T   204.6      .    .     GT  0/0  0/0    ./.  0/0     ./.
2     1 6051407  .   A   G 1009.66      .    .     GT  0/0  0/0  0/0  0/0   0/0
3     1 6051488  .   C   A  699.84      .    .     GT  0/0  0/0  0/0  0/0   0/0
4     1 6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/1   0/0
5     1 6051833  .   T   C  178.88      .    .     GT  0/0  0/0  0/0  0/0   0/0

And run the dxy function to calculate nucleotide divergence between them:

dxy(group1, group2, perBP=TRUE)
[1] 0.003956835
dxy(group1, group2, perBP=FALSE)
[1] 2.2

expected.het

Description

Takes as input a vector of sample genotypes (as returned by get.field) and returns a vector of 3 values: p (reference allele frequency), n (sample size), and Hexp (expected heterozygosity).

Usage
expected.het <- function(genotypes) {
    obs.counts = count.genotypes(genotypes)
    n = sum(obs.counts) - obs.counts["NN"]
    freqs = allele.freq(obs.counts)
    Hexp = 2 * freqs[1] * freqs[2]
    res = c(freqs["p"], n, Hexp)
    res=as.numeric(unname(res))
    return(res)
}
Arguments
genotypes
A vector of sample strings in the format 0/0 or 0|1, such as the vector returned by get.field when fieldName="GT".
Examples

Create a hypothetical genotype vector with 3 copies of each genotype:

my.genotypes = rep(c("0/0", "0/1", "1/1"), 3)
my.genotypes
[1] "0/0" "0/1" "1/1" "0/0" "0/1" "1/1" "0/0" "0/1" "1/1"

Now, run expected.het to get p, N, and Hexp:

expected.het(my.genotypes)
[1] 0.5 9.0 0.5

fixed.poly

Description

Classifies each SNP in a set of data as either "Fixed" or "Polymorphic" between 2 species/populations. Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and returns a vector with one of 2 possible values for each row in the vcf: "Fixed" or "Polymorphic."

Usage
fixed.poly = function(vcf1, vcf2) {
    g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
    g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
    af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
    af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
    res = rep("Polymorphic", nrow(g1))
    res[which(abs(af1-af2)==1)] = "Fixed"
    return(res)
}
Arguments

Requires 2 data frames with VCF formatted data. The data frames MUST have the same number of rows, but do not need to have the same number of columns. fixed.poly will compare the allele frequencies between the two data frames to classify SNPs as shared or polymorphic.

Examples

Using the Polar Bear data set from Week 9:

my.data=read.vcf("SampleData/bears_wAnn.vcf", header=TRUE, stringsAsFactors=FALSE)

brown.col=grep("^Ua", colnames(my.data))
polar.col=grep("^Umar", colnames(my.data))

brown.data=my.data[,-polar.col]
polar.data=my.data[,-brown.col]

SiteClass=fixed.poly(brown.data, polar.data)
head(SiteClass)
[1] "Fixed"       "Fixed"       "Polymorphic" "Polymorphic" "Polymorphic" "Polymorphic"

get.field

Description

Parses the information from a vector of sample genotype strings in VCF
format (e.g. "0/1:2,15:17:5,0,36") and returns a vector with only a
single specifed field for each sample.

Usage
get.field <- function(samples, format, fieldName) {
	x=strsplit(samples, split=":") 
	fields=unlist(strsplit(format, split=":")) 
	i=which(fields==fieldName)
	if (!(fieldName %in% fields)) stop('fieldName not found in format fields')
        return(sapply(x, `[[`, i)) 
   }
Arguments
sample
A vector of VCF format genotype strings from the sample column(s) of a VCF file.
format
The string from the FORMAT column (usually column 9) of a VCF file. Defines which fields are present.
fieldName
A character string specifying which field to return. Example: "GT", "GP", "AD", "DP", etc.
Examples

Starting with a single row from an example VCF file that looks like this:

vcf.row <- c( "1", "32306701", "S1_32306701", "G", "A",  "34", "PASS", ".",	"GT:AD:DP:GQ:PL",
"1/1:4,14:18:14:22,14,0",  "0/1:9,3:12:13:23,13,0" ,"0/0:12,1:13:1:1,6,0",  "0/0:21,1:22:1:1,20,0",
"0/1:19,5:24:1:18,1,0", "0/0:16,3:19:10:10,15,0")

We can extract the genotype (GT) field from each of the 6 samples like this:

get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="GT")
[1] "1/1" "0/1" "0/0" "0/0" "0/1" "0/0"

Similarly, we can get the Allele Depth (AD) field:

get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="AD")
[1] "4,14" "9,3"  "12,1" "21,1" "19,5" "16,3"

Or any of the other 3 fields present in this file:

get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="DP")
[1] "18" "12" "13" "22" "24" "19"
get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="GQ")
[1] "14" "13" "1"  "1"  "1"  "10"
get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="PL")
[1] "22,14,0" "23,13,0" "1,6,0"   "1,20,0"  "18,1,0"  "10,15,0"

get.haplotypes

Description

Takes as input two genotype strings (from two different rows of a VCF file), such as those generated by the get.field() function. Returns a single string of individual haplotypes in the form "00", "01", "10", or "11".

Usage
get.haplotypes <- function(genotypes1, genotypes2) {
    a1 = gsub("\\|", "", genotypes1) 
    a2 = gsub("\\|", "", genotypes2)
    a1=unlist(strsplit(paste0(a1, collapse=""), split="")) 
    a2=unlist(strsplit(paste0(a2, collapse=""), split=""))
    haps = paste0(a1,a2)
    return(haps)
}
Arguments
genotypes1
A string of genotypes, such as output by get.field()
genotypes2
A string of genotypes, such as output by get.field()
Examples

Given the 2 genotype strings below:

g1 = c("0|1", "0|1", "1|1", "0|0", "1|1")
g2 = c("1|1", "1|0", "0|0", "0|1", "0|1")

use the function to get the single haplotype string:

get.haplotypes(genotypes1=g1, genotypes2=g2)
 [1] "01" "11" "01" "10" "10" "10" "00" "01" "10" "11"

maf

Description

Returns a calculation of the Minor Allele Frequency for a single row (SNP position) in a VCF format file. Assumes bi-allelic loci.

Usage
maf <- function(vcf.row) {
	temp=as.vector(vcf.row, mode="character")
	af=allele.freq(count.genotypes(get.field(temp[10:length(temp)], temp[9], "GT")))
	maf=min(unname(af))
	return(maf)
}
Arguments

A single vector corresponding to 1 row of a VCF format file.

Examples

If we create an example row in VCF format like this:

test=c("1","826","373733_57" ,"T", "G",".",".",".","GT:DP:GQ","0/0:17:30","0/0:18:28","0/0:38:31", "0/0:39:16", "0/1:12:33", "0/0:1:20", "0/0:37:6", "0/0:15:4", "1/1:12:1", "0/0:7:22","0/0:19:17")

We run the maf() function on it like this:

maf(vcf.row=test)
[1] 0.1363636

pi.diversity

Description

Takes as input a data frame in VCF format (such as created by read.vcf()) and calculates a single estimate of the Nucleotide Diversity (Pi). By default, this will calculate Pi per base pair, unless perBP is set to FALSE.

Usage
pi.diversity = function(vcf, perBP=TRUE) {
    J=apply(vcf, 1, derivedCount)
    N=apply(vcf, 1, function(x) sum(count.genotypes(get.field(x[10:length(x)], x[9], "GT"))))
    pi = sum((2*J*(C-J))/(C*(C-1)))
    if (perBP) {
        c = unique(vcf$CHROM)
        s = sapply(c, function(x,y) min(y[which(y$CHROM==x),2]), y=vcf)
        e = sapply(c, function(x,y) max(y[which(y$CHROM==x),2]), y=vcf)
        bp=sum(e-s)
        return(pi/bp)
    } else { return(pi) }
}
Arguments

A single VCF format data frame (either the original file read in by read.vcf or any subset() of this file). An optional setting of perBP=FALSE to return the raw value of Pi for the whole region. Since this function assumes no missing data between the positions of the first and last SNP, you may want to return the raw value and estimate the per base rate on your own if you believe this assumption is not met in your data.

Example

Given the data frame below:

test.data
  CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6 Ind7 Ind8 Ind9 Ind10
1     1 6051277  .   A   T   204.6      .    .     GT    .  1/1    .    .    .  0/0  0/0    .  0/0     .
2     1 6051407  .   A   G 1009.66      .    .     GT  1/1  1/1  1/1  0/0  0/0  0/0  0/0  0/0  0/0   0/0
3     1 6051488  .   C   A  699.84      .    .     GT  0/0  0/1  1/1  0/0  0/0  0/0  0/0  0/0  0/0   0/0
4     1 6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/1   0/0
5     1 6051833  .   T   C  178.88      .    .     GT  1/1  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/0   0/0

Use the function to calculate Pi:

pi.diversity(test.data, perBP=TRUE)
[1] 0.002569373
pi.diversity(test.data, perBP=FALSE)
[1] 1.428571

read.vcf

Description

Reads a VCF file into R as a tab-delimited table. Takes as input the
name of a VCF file, and returns an object in the form of a data frame.

Usage
read.vcf <- function(file, special.char="##", ...) {
    my.search.term=paste0(special.char, ".*")
    all.lines=readLines(file)
    clean.lines=gsub(my.search.term, "",  all.lines)
    clean.lines=gsub("#CHROM", "CHROM", clean.lines)
    read.table(..., text=paste(clean.lines, collapse="\n"))
	}
Arguments
file
The name of the VCF file to read in.
special.char
A special character string that denotes lines to ignore. Unless otherwise specified, this will always be "##".
...
Additional arguments that may be passed to the function `read.table()`.
Examples

If we have a VCF file called "sampleData.vcf" we can read it into R
like this:

my.data = read.vcf(file = "sampleData.vcf", header=TRUE,
stringsAsFactors=FALSE)

variance.d

Description

Calculates the variance of d (pi - waterson's theta) for the denominator of the Tajima's D equation.

Usage
variance.d <- function(n,S) {
    a1=sum(1/(seq(from=1, to=(n-1), by=1)))
    a2=sum(1/((seq(from=1, to=(n-1), by=1))**2))
    b1=(n+1)/(3*(n-1))
    b2=(2*((n**2)+n+3))/((9*n)*(n-1))
    c1=b1 - (1/a1)
    c2=b2-((n+2)/(a1*n)) + (a2/(a1**2))
    e1=c1/a1
    e2=c2/((a1**2)+a2)
    var=(e1*S) + (e2*S*(S-1))
    return(var)
	}
Arguments
n
The number of chromosomes (2 times the number of diploid samples).
S
The number of segregating sites (SNPs).
Examples

Given a gene with sequences for 30 individuals, and 100 SNP sites:

n = 2*30
S = 100
variance.d(n,S)
[1] 38.04112

waterson.theta

Description

Calculates a single estimate of Waterson's theta for an entire VCF formatted data table (such as one created by using read.vcf(). Can also run on any subset() of this table. Will automatically check which sites are variant (using the maf() function).

Usage
waterson.theta <- function(data, perBP=TRUE) {
        num.bp = nrow(data)
	check=apply(data, 1, FUN=maf)
	filter=data[which(check>0),]
	Sn=nrow(filter)
	n.ind=ncol(filter)-9
	i=seq(1, ((2*n.ind)-1))
	theta=Sn/sum(1/i)
	if (perBP) { return(theta/num.bp) }
        else { return(theta) }
}
Arguments

Takes a single data.frame variable as input, along with a TRUE/FALSE argument for whether or not to calculate the per base pair rate of theta. By default this function assumes that perBP=TRUE. The data.frame must have VCF formatted data for the function to work correctly.

Examples

Given the table below:

vcf.table=data.frame(CHROM=rep(1,5), POS=seq(1,5), ID=rep(".", 5), REF=c("A","T","C","C","G"), ALT=c("T","G","G","A","C"), QUAL=rep(30,5), FILTER=rep(".",5), INFO=rep(".",5), FORMAT=rep("GT",5), IND.1=c("0/0","0/1","0/1","1/1","1/1"),IND.2=c("0/0","0/1","0/1","1/1","1/1"),IND.3=c("0/0","0/1","0/1","1/1","1/1"), stringsAsFactors=FALSE)

vcf.table
  CHROM POS ID REF ALT QUAL FILTER INFO FORMAT IND.1 IND.2 IND.3
1     1   1  .   A   T   30      .    .     GT   0/0   0/0   0/0
2     1   2  .   T   G   30      .    .     GT   0/1   0/1   0/1
3     1   3  .   C   G   30      .    .     GT   0/1   0/1   0/1
4     1   4  .   C   A   30      .    .     GT   1/1   1/1   1/1
5     1   5  .   G   C   30      .    .     GT   1/1   1/1   1/1

Run the waterson.theta function:

waterson.theta(vcf.table, perBP=FALSE)
[1] 0.8759124

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment