Instantly share code, notes, and snippets.

# eacooper400/compGen17_fxns.R Secret

Last active January 27, 2024 01:52
Show Gist options
• Save eacooper400/c954c757db02eb6c4ccfae1aa090658c to your computer and use it in GitHub Desktop.
R Functions for GEN 8900 Comp. Genomics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 ### Calculate Allele Frequencies from Genotype Counts allele.freq <- function(genotypeCounts) { n = sum(genotypeCounts) - genotypeCounts["NN"] p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n) q = 1-p freqs = c(p,q) names(freqs) = c("p", "q") return(freqs) } ### Calculate the LD parameter R-squared from 2 rows of a VCF file calc_r2 <- function(row1, row2) { g1 = get.field(row1[10:length(row1)], row1[9], "GT") g2 = get.field(row2[10:length(row2)], row2[9], "GT") pA = unname(allele.freq(count.genotypes(g1))["p"]) pB = unname(allele.freq(count.genotypes(g2))["p"]) h = get.haplotypes(g1, g2) pAB = (length(h[h=="00"]))/(length(h)) D = pAB - (pA * pB) rsq = (D**2)/(pA*(1-pA)*pB*(1-pB)) return(rsq) } ### Count the AA, Aa, and aa genotypes in a sample count.genotypes <- function(genotypes) { genotypes = gsub("(\\||/)", "", genotypes) gen.patterns = c("00", "01", "10", "11", "..") my.counts=table(factor(genotypes, levels=gen.patterns)) final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5]) names(final.counts) = c("AA", "Aa", "aa", "NN") return(final.counts) } ### Count the number of derived alleles for a VCF row derivedCount <- function(row) { row=as.vector(row, mode="character") x=count.genotypes(get.field(row[10:length(row)], row[9], "GT")) dc=(2*x["aa"])+x["Aa"] return(unname(dc)) } ### Calculate Nucleotide Divergence (Dxy) dxy <- function(vcf1, vcf2, perBP=TRUE) { g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"]) af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"]) ## Let x be the allele frequency (p) in pop1 * (1-p) in Pop2 x = af1 * (1-af2) ## Let y be the allele frequency (p) in pop2 * (1-p) in Pop1 y = af2 * (1-af1) dxy=sum((x+y)) if (perBP) { c = unique(vcf1\$CHROM) s = sapply(c, function(x,y) min(y[which(y\$CHROM==x),2]), y=vcf1) e = sapply(c, function(x,y) max(y[which(y\$CHROM==x),2]), y=vcf1) bp=sum(e-s) return(dxy/bp) } else { return(dxy) } } ### Calculate Expected Het. and return, p, n, and H expected.het <- function(genotypes) { obs.counts = count.genotypes(genotypes) n = sum(obs.counts) - obs.counts["NN"] freqs = allele.freq(obs.counts) Hexp = 2 * freqs[1] * freqs[2] res = c(freqs["p"], n, Hexp) res=as.numeric(unname(res)) return(res) } ### Determine which SNPs are Polymorphic vs Fixed in 2 species fixed.poly <- function(vcf1, vcf2) { g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT"))) af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"]) af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"]) res = rep("Polymorphic", nrow(g1)) res[which(abs(af1-af2)==1)] = "Fixed" return(res) } ### Get a Specified Field From a VCF Sample/Genotype String get.field <- function(samples, format, fieldName) { x=strsplit(samples, split=":") fields=unlist(strsplit(format, split=":")) i=which(fields==fieldName) if (!(fieldName %in% fields)) stop('fieldName not found in format fields') return(sapply(x, `[[`, i)) } ### Get the HAPLOTYPES for a pair of genotype strings get.haplotypes <- function(genotypes1, genotypes2) { a1 = gsub("\\|", "", genotypes1) a2 = gsub("\\|", "", genotypes2) a1=unlist(strsplit(paste0(a1, collapse=""), split="")) a2=unlist(strsplit(paste0(a2, collapse=""), split="")) haps = paste0(a1,a2) return(haps) } ### Calculate minor allele frequency maf <- function(vcf.row) { temp=as.vector(vcf.row, mode="character") af=allele.freq(count.genotypes(get.field(temp[10:length(temp)], temp[9], "GT"))) maf=min(unname(af)) return(maf) } ### Calculate Nucleotide Diversity (Pi) pi.diversity <- function(vcf, perBP=TRUE) { J=apply(vcf, 1, derivedCount) N=apply(vcf, 1, function(x) sum(count.genotypes(get.field(x[10:length(x)], x[9], "GT")))) C=2*N pi = sum((2*J*(C-J))/(C*(C-1))) if (perBP) { c = unique(vcf\$CHROM) s = sapply(c, function(x,y) min(y[which(y\$CHROM==x),2]), y=vcf) e = sapply(c, function(x,y) max(y[which(y\$CHROM==x),2]), y=vcf) bp=sum(e-s) return(pi/bp) } else { return(pi) } } ### Read in a VCF file as a table read.vcf <- function(file, special.char="##", ...) { my.search.term=paste0(special.char, ".*") all.lines=readLines(file) clean.lines=gsub(my.search.term, "", all.lines) clean.lines=gsub("#CHROM", "CHROM", clean.lines) read.table(..., text=paste(clean.lines, collapse="\n")) } ### Calculate the variance for Tajima's D variance.d <- function(n,S) { a1=sum(1/(seq(from=1, to=(n-1), by=1))) a2=sum(1/((seq(from=1, to=(n-1), by=1))**2)) b1=(n+1)/(3*(n-1)) b2=(2*((n**2)+n+3))/((9*n)*(n-1)) c1=b1 - (1/a1) c2=b2-((n+2)/(a1*n)) + (a2/(a1**2)) e1=c1/a1 e2=c2/((a1**2)+a2) var=(e1*S) + (e2*S*(S-1)) return(var) } ### Calculate Waterson's theta waterson.theta <- function(data, perBP=TRUE) { num.bp=nrow(data) check=apply(data, 1, FUN=maf) filter=data[which(check>0),] Sn=nrow(filter) n.ind=ncol(filter)-9 i=seq(1, ((2*n.ind)-1)) theta=Sn/sum(1/i) if (perBP) { return(theta/num.bp) } else { return(theta) } }

# Function Descriptions and Examples

### allele.freq

###### Description

Returns a vector of 2 allele frequencies, p and q, based on a
starting vector of genotype counts (such as the named vector
returned by count.genotypes().

###### Usage
``````allele.freq <- function(genotypeCounts) {
n = sum(genotypeCounts) - genotypeCounts["NN"]
p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n)
q = 1-p
return(c(p,q))
}
``````
###### Arguments
genotypeCounts
A named vector of of genotype counts, such as the one returned by count.genotypes(). Vector names should be c("AA", "Aa", "aa", "NN").
###### Examples

Set up a hypothetical vector of genotype counts, where there are 30 AA genotypes,
22 Aa, 15 aa, and 5 missing:

``````my.counts = c(30, 22, 15, 5)
names(my.counts) = c("AA", "Aa", "aa", "NN")

my.counts
AA Aa aa NN
30 22 15  5
``````

Run `allele.freq` to calculate the frequency of the 'A' allele (p) and
the 'a' allele (q):

``````allele.freq(my.counts)
p         q
0.6119403 0.3880597
``````

### calc_r2

###### Description

Calculates and returns a single numeric value for r-squared, based on 2 input rows in VCF format (which are interpreted by the function as containing phased genotypes from two separate SNP positions).

###### Usage
``````calc_r2 = function(row1, row2) {
g1 = get.field(row1[10:length(row1)], row1[9], "GT")
g2 = get.field(row2[10:length(row2)], row2[9], "GT")
pA = unname(allele.freq(count.genotypes(g1))["p"])
pB = unname(allele.freq(count.genotypes(g2))["p"])
h = get.haplotypes(g1, g2)
pAB = (length(h[h=="00"]))/(length(h))
D = pAB - (pA * pB)
rsq = (D**2)/(pA*(1-pA)*pB*(1-pB))
return(rsq)
}
``````
###### Arguments

2 character vectors corresponding to rows from a VCF file.

###### Examples

Given the following VCF file:

``````my.data
CHROM      POS ID REF ALT QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6
2     1 16251945  .   T   C    .   PASS    .     GT  0|1  1|0  0|1  0|1  0|1  1|1
3     1 16252180  .   G   A    .   PASS    .     GT  1|0  0|1  1|0  1|0  1|0  0|1
4     1 16254123  .   A   C    .   PASS    .     GT  1|0  1|1  1|0  1|0  1|0  0|1
5     1 16254567  .   A   G    .   PASS    .     GT  1|1  0|0  1|1  1|0  1|1  1|1
6     1 16255047  .   A   G    .   PASS    .     GT  1|1  1|0  1|1  1|1  1|1  1|1
``````

Use the function on the first 2 rows:

``````calc_r2(row1=as.vector(my.data[1,], mode="character"), row2=as.vector(my.data[2,], mode="character"))
[1] 0.7142857
``````

### count.genotypes

###### Description

Counts the number of 'AA', 'Aa', 'aa', and missing genotypes in a string of
genotypes in VCF format (0/0, 0/1, 1/1, or 0|0, 0|1, 1|0, 1|1). Returns a named vector of
4 values: AA, Aa, aa, NN.

###### Usage
``````count.genotypes <- function(genotypes) {
genotypes = gsub("(\\||/)", "", genotypes)
gen.patterns = c("00", "01", "10", "11", "..")
my.counts=table(factor(genotypes, levels=gen.patterns))
final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5])
names(final.counts) = c("AA", "Aa", "aa", "NN")
return(final.counts)
}
``````
###### Arguments
genotypes
A vector of sample strings in the format 0/0 or 0|1, such as the vector returned by get.field when fieldName="GT".
###### Examples

Create a hypothetical genotype vector with 3 copies of each genotype:

``````my.genotypes = rep(c("0/0", "0/1", "1/1"), 3)
my.genotypes
[1] "0/0" "0/1" "1/1" "0/0" "0/1" "1/1" "0/0" "0/1" "1/1"
``````

Now, run count.genotypes to get the counts:

``````count.genotypes(my.genotypes)
AA Aa aa NN
3  3  3  0
``````

### derivedCount

###### Description

Takes as input a single row from a VCF formatted file, and outputs the count of derived (ALT) alleles at that row.

###### Usage
``````derivedCount <- function(row) {
row=as.vector(row, mode="character")
x=count.genotypes(get.field(row[10:length(row)], row[9], "GT"))
dc=(2*x["aa"])+x["Aa"]
return(dc)
}
``````
###### Arguments

Requires a single vector in the format of a row from a VCF file.

###### Examples

Given a VCF row like this:

``````my.row=c("1", "68020018", " .", "C",  "G", "1490.64"," ."," ." , "GT:AD:DP:GQ:PL","1/1:12,0:12:36:0,36,478", "0/0:6,0:6:18:0,18,178", "0/0:7,0:7:21:0,21,278", "0/0:4,0:4:9:0,9,135", "0/0:10,0:10:30:0,30,380", "0/1:11,0:11:33:0,33,426")
``````

Used derivedCount like this:

``````derivedCount(row=my.row)
aa
3
``````

### dxy

###### Description

Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and calculates a single value for nucleotide divergence (Dxy) between the 2 subsets.

###### Usage
``````dxy <- function(vcf1, vcf2, perBP=TRUE) {
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
## Let x be the allele frequency (p) in pop1 * (1-p) in Pop2
x = af1 * (1-af2)
## Let y be the allele frequency (p) in pop2 * (1-p) in Pop1
y = af2 * (1-af1)
dxy=sum((x+y))
if (perBP) {
c = unique(vcf1\$CHROM)
s = sapply(c, function(x,y) min(y[which(y\$CHROM==x),2]), y=vcf1)
e = sapply(c, function(x,y) max(y[which(y\$CHROM==x),2]), y=vcf1)
bp=sum(e-s)
return(dxy/bp)
} else { return(dxy) }
}
``````
###### Arguments

`vcf1` and `vcf2` must each be a data frame with VCF formatted data. Both data frames must have the same number of rows. By default, `dxy` will be calculated as a per base pair rate, unless the argument `perBP=FALSE` is provided. The function assumes no missing data between the first and last position when calculating the number of base pairs.

###### Examples

Given the data set below:

``````test.data
CHROM     POS        ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6 Ind7 Ind8 Ind9 Ind10
1     1            6051277  .   A     T        204.6       .       .           GT        ./.    1/1    ./.      ./.     ./.     0/0  0/0    ./.    0/0     ./.
2     1           6051407  .   A     G      1009.66     .       .           GT       1/1    1/1   1/1     0/0   0/0   0/0  0/0   0/0   0/0   0/0
3     1           6051488  .   C     A       699.84     .       .           GT       0/0   0/1   1/1     0/0   0/0   0/0  0/0   0/0   0/0   0/0
4     1           6051602  .   C     T        29.12       .       .           GT       0/0   0/0  0/0     0/0   0/0   0/0  0/0   0/0   0/1   0/0
5     1           6051833  .   T     C       178.88      .       .           GT       1/1    0/0  0/0     0/0   0/0   0/0  0/0   0/0   0/0   0/0
``````

Split Individuals 1-5 and 6-10 into 2 groups data frames, keeping the first 9 columns in each:

``````group1=test.data[,-c(15:19)]
print(group1)
CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5
1     1            6051277  .   A   T   204.6      .    .     GT    ./.  1/1    ./.    ./.    ./.
2     1            6051407  .   A   G 1009.66      .    .     GT  1/1  1/1  1/1  0/0  0/0
3     1            6051488  .   C   A  699.84      .    .     GT  0/0  0/1  1/1  0/0  0/0
4     1            6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/0  0/0
5     1             6051833  .   T   C  178.88      .    .     GT  1/1  0/0  0/0  0/0  0/0

group2=test.data[,-c(10:14)]
group2
CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind6 Ind7 Ind8 Ind9 Ind10
1     1 6051277  .   A   T   204.6      .    .     GT  0/0  0/0    ./.  0/0     ./.
2     1 6051407  .   A   G 1009.66      .    .     GT  0/0  0/0  0/0  0/0   0/0
3     1 6051488  .   C   A  699.84      .    .     GT  0/0  0/0  0/0  0/0   0/0
4     1 6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/1   0/0
5     1 6051833  .   T   C  178.88      .    .     GT  0/0  0/0  0/0  0/0   0/0
``````

And run the `dxy` function to calculate nucleotide divergence between them:

``````dxy(group1, group2, perBP=TRUE)
[1] 0.003956835
dxy(group1, group2, perBP=FALSE)
[1] 2.2
``````

### expected.het

###### Description

Takes as input a vector of sample genotypes (as returned by get.field) and returns a vector of 3 values: p (reference allele frequency), n (sample size), and Hexp (expected heterozygosity).

###### Usage
``````expected.het <- function(genotypes) {
obs.counts = count.genotypes(genotypes)
n = sum(obs.counts) - obs.counts["NN"]
freqs = allele.freq(obs.counts)
Hexp = 2 * freqs[1] * freqs[2]
res = c(freqs["p"], n, Hexp)
res=as.numeric(unname(res))
return(res)
}
``````
###### Arguments
genotypes
A vector of sample strings in the format 0/0 or 0|1, such as the vector returned by get.field when fieldName="GT".
###### Examples

Create a hypothetical genotype vector with 3 copies of each genotype:

``````my.genotypes = rep(c("0/0", "0/1", "1/1"), 3)
my.genotypes
[1] "0/0" "0/1" "1/1" "0/0" "0/1" "1/1" "0/0" "0/1" "1/1"
``````

Now, run expected.het to get p, N, and Hexp:

``````expected.het(my.genotypes)
[1] 0.5 9.0 0.5
``````

### fixed.poly

###### Description

Classifies each SNP in a set of data as either "Fixed" or "Polymorphic" between 2 species/populations. Takes as input 2 VCF formatted tables (such as created by using read.vcf() and then extracting 2 subsets based on colnames()), and returns a vector with one of 2 possible values for each row in the vcf: "Fixed" or "Polymorphic."

###### Usage
``````fixed.poly = function(vcf1, vcf2) {
g1=t(apply(vcf1, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
g2=t(apply(vcf2, 1, function(x) get.field(x[10:length(x)], x[9], "GT")))
af1=apply(g1, 1, function(x) allele.freq(count.genotypes(x))["p"])
af2=apply(g2, 1, function(x) allele.freq(count.genotypes(x))["p"])
res = rep("Polymorphic", nrow(g1))
res[which(abs(af1-af2)==1)] = "Fixed"
return(res)
}
``````
###### Arguments

Requires 2 data frames with VCF formatted data. The data frames MUST have the same number of rows, but do not need to have the same number of columns. fixed.poly will compare the allele frequencies between the two data frames to classify SNPs as shared or polymorphic.

###### Examples

Using the Polar Bear data set from Week 9:

``````my.data=read.vcf("SampleData/bears_wAnn.vcf", header=TRUE, stringsAsFactors=FALSE)

brown.col=grep("^Ua", colnames(my.data))
polar.col=grep("^Umar", colnames(my.data))

brown.data=my.data[,-polar.col]
polar.data=my.data[,-brown.col]

SiteClass=fixed.poly(brown.data, polar.data)
[1] "Fixed"       "Fixed"       "Polymorphic" "Polymorphic" "Polymorphic" "Polymorphic"
``````

### get.field

###### Description

Parses the information from a vector of sample genotype strings in VCF
format (e.g. "0/1:2,15:17:5,0,36") and returns a vector with only a
single specifed field for each sample.

###### Usage
``````get.field <- function(samples, format, fieldName) {
x=strsplit(samples, split=":")
fields=unlist(strsplit(format, split=":"))
i=which(fields==fieldName)
return(sapply(x, `[[`, i))
}
``````
###### Arguments
sample
A vector of VCF format genotype strings from the sample column(s) of a VCF file.
format
The string from the FORMAT column (usually column 9) of a VCF file. Defines which fields are present.
fieldName
A character string specifying which field to return. Example: "GT", "GP", "AD", "DP", etc.
###### Examples

Starting with a single row from an example VCF file that looks like this:

``````vcf.row <- c( "1", "32306701", "S1_32306701", "G", "A",  "34", "PASS", ".",	"GT:AD:DP:GQ:PL",
"1/1:4,14:18:14:22,14,0",  "0/1:9,3:12:13:23,13,0" ,"0/0:12,1:13:1:1,6,0",  "0/0:21,1:22:1:1,20,0",
"0/1:19,5:24:1:18,1,0", "0/0:16,3:19:10:10,15,0")
``````

We can extract the genotype (GT) field from each of the 6 samples like this:

``````get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="GT")
[1] "1/1" "0/1" "0/0" "0/0" "0/1" "0/0"
``````

Similarly, we can get the Allele Depth (AD) field:

``````get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="AD")
[1] "4,14" "9,3"  "12,1" "21,1" "19,5" "16,3"
``````

Or any of the other 3 fields present in this file:

``````get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="DP")
[1] "18" "12" "13" "22" "24" "19"
get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="GQ")
[1] "14" "13" "1"  "1"  "1"  "10"
get.field(samples = vcf.row[10:15], format=vcf.row[9], fieldName="PL")
[1] "22,14,0" "23,13,0" "1,6,0"   "1,20,0"  "18,1,0"  "10,15,0"
``````

### get.haplotypes

###### Description

Takes as input two genotype strings (from two different rows of a VCF file), such as those generated by the get.field() function. Returns a single string of individual haplotypes in the form "00", "01", "10", or "11".

###### Usage
``````get.haplotypes <- function(genotypes1, genotypes2) {
a1 = gsub("\\|", "", genotypes1)
a2 = gsub("\\|", "", genotypes2)
a1=unlist(strsplit(paste0(a1, collapse=""), split=""))
a2=unlist(strsplit(paste0(a2, collapse=""), split=""))
haps = paste0(a1,a2)
return(haps)
}
``````
###### Arguments
genotypes1
A string of genotypes, such as output by get.field()
genotypes2
A string of genotypes, such as output by get.field()
###### Examples

Given the 2 genotype strings below:

``````g1 = c("0|1", "0|1", "1|1", "0|0", "1|1")
g2 = c("1|1", "1|0", "0|0", "0|1", "0|1")
``````

use the function to get the single haplotype string:

``````get.haplotypes(genotypes1=g1, genotypes2=g2)
[1] "01" "11" "01" "10" "10" "10" "00" "01" "10" "11"
``````

### maf

###### Description

Returns a calculation of the Minor Allele Frequency for a single row (SNP position) in a VCF format file. Assumes bi-allelic loci.

###### Usage
``````maf <- function(vcf.row) {
temp=as.vector(vcf.row, mode="character")
af=allele.freq(count.genotypes(get.field(temp[10:length(temp)], temp[9], "GT")))
maf=min(unname(af))
return(maf)
}
``````
###### Arguments

A single vector corresponding to 1 row of a VCF format file.

###### Examples

If we create an example row in VCF format like this:

``````test=c("1","826","373733_57" ,"T", "G",".",".",".","GT:DP:GQ","0/0:17:30","0/0:18:28","0/0:38:31", "0/0:39:16", "0/1:12:33", "0/0:1:20", "0/0:37:6", "0/0:15:4", "1/1:12:1", "0/0:7:22","0/0:19:17")
``````

We run the `maf()` function on it like this:

``````maf(vcf.row=test)
[1] 0.1363636
``````

### pi.diversity

###### Description

Takes as input a data frame in VCF format (such as created by `read.vcf()`) and calculates a single estimate of the Nucleotide Diversity (Pi). By default, this will calculate Pi per base pair, unless `perBP` is set to `FALSE`.

###### Usage
``````pi.diversity = function(vcf, perBP=TRUE) {
J=apply(vcf, 1, derivedCount)
N=apply(vcf, 1, function(x) sum(count.genotypes(get.field(x[10:length(x)], x[9], "GT"))))
pi = sum((2*J*(C-J))/(C*(C-1)))
if (perBP) {
c = unique(vcf\$CHROM)
s = sapply(c, function(x,y) min(y[which(y\$CHROM==x),2]), y=vcf)
e = sapply(c, function(x,y) max(y[which(y\$CHROM==x),2]), y=vcf)
bp=sum(e-s)
return(pi/bp)
} else { return(pi) }
}
``````
###### Arguments

A single VCF format data frame (either the original file read in by `read.vcf` or any `subset()` of this file). An optional setting of `perBP=FALSE` to return the raw value of Pi for the whole region. Since this function assumes no missing data between the positions of the first and last SNP, you may want to return the raw value and estimate the per base rate on your own if you believe this assumption is not met in your data.

###### Example

Given the data frame below:

``````test.data
CHROM     POS ID REF ALT    QUAL FILTER INFO FORMAT Ind1 Ind2 Ind3 Ind4 Ind5 Ind6 Ind7 Ind8 Ind9 Ind10
1     1 6051277  .   A   T   204.6      .    .     GT    .  1/1    .    .    .  0/0  0/0    .  0/0     .
2     1 6051407  .   A   G 1009.66      .    .     GT  1/1  1/1  1/1  0/0  0/0  0/0  0/0  0/0  0/0   0/0
3     1 6051488  .   C   A  699.84      .    .     GT  0/0  0/1  1/1  0/0  0/0  0/0  0/0  0/0  0/0   0/0
4     1 6051602  .   C   T   29.12      .    .     GT  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/1   0/0
5     1 6051833  .   T   C  178.88      .    .     GT  1/1  0/0  0/0  0/0  0/0  0/0  0/0  0/0  0/0   0/0
``````

Use the function to calculate Pi:

``````pi.diversity(test.data, perBP=TRUE)
[1] 0.002569373
pi.diversity(test.data, perBP=FALSE)
[1] 1.428571
``````

###### Description

Reads a VCF file into R as a tab-delimited table. Takes as input the
name of a VCF file, and returns an object in the form of a data frame.

###### Usage
``````read.vcf <- function(file, special.char="##", ...) {
my.search.term=paste0(special.char, ".*")
clean.lines=gsub(my.search.term, "",  all.lines)
clean.lines=gsub("#CHROM", "CHROM", clean.lines)
}
``````
###### Arguments
file
The name of the VCF file to read in.
special.char
A special character string that denotes lines to ignore. Unless otherwise specified, this will always be "##".
...
###### Examples

If we have a VCF file called "sampleData.vcf" we can read it into R
like this:

``````my.data = read.vcf(file = "sampleData.vcf", header=TRUE,
stringsAsFactors=FALSE)
``````

### variance.d

###### Description

Calculates the variance of d (pi - waterson's theta) for the denominator of the Tajima's D equation.

###### Usage
``````variance.d <- function(n,S) {
a1=sum(1/(seq(from=1, to=(n-1), by=1)))
a2=sum(1/((seq(from=1, to=(n-1), by=1))**2))
b1=(n+1)/(3*(n-1))
b2=(2*((n**2)+n+3))/((9*n)*(n-1))
c1=b1 - (1/a1)
c2=b2-((n+2)/(a1*n)) + (a2/(a1**2))
e1=c1/a1
e2=c2/((a1**2)+a2)
var=(e1*S) + (e2*S*(S-1))
return(var)
}
``````
###### Arguments
n
The number of chromosomes (2 times the number of diploid samples).
S
The number of segregating sites (SNPs).
###### Examples

Given a gene with sequences for 30 individuals, and 100 SNP sites:

``````n = 2*30
S = 100
variance.d(n,S)
[1] 38.04112
``````

### waterson.theta

###### Description

Calculates a single estimate of Waterson's theta for an entire VCF formatted data table (such as one created by using `read.vcf()`. Can also run on any `subset()` of this table. Will automatically check which sites are variant (using the `maf()` function).

###### Usage
``````waterson.theta <- function(data, perBP=TRUE) {
num.bp = nrow(data)
check=apply(data, 1, FUN=maf)
filter=data[which(check>0),]
Sn=nrow(filter)
n.ind=ncol(filter)-9
i=seq(1, ((2*n.ind)-1))
theta=Sn/sum(1/i)
if (perBP) { return(theta/num.bp) }
else { return(theta) }
}
``````
###### Arguments

Takes a single data.frame variable as input, along with a TRUE/FALSE argument for whether or not to calculate the per base pair rate of theta. By default this function assumes that perBP=TRUE. The data.frame must have VCF formatted data for the function to work correctly.

###### Examples

Given the table below:

``````vcf.table=data.frame(CHROM=rep(1,5), POS=seq(1,5), ID=rep(".", 5), REF=c("A","T","C","C","G"), ALT=c("T","G","G","A","C"), QUAL=rep(30,5), FILTER=rep(".",5), INFO=rep(".",5), FORMAT=rep("GT",5), IND.1=c("0/0","0/1","0/1","1/1","1/1"),IND.2=c("0/0","0/1","0/1","1/1","1/1"),IND.3=c("0/0","0/1","0/1","1/1","1/1"), stringsAsFactors=FALSE)

vcf.table
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT IND.1 IND.2 IND.3
1     1   1  .   A   T   30      .    .     GT   0/0   0/0   0/0
2     1   2  .   T   G   30      .    .     GT   0/1   0/1   0/1
3     1   3  .   C   G   30      .    .     GT   0/1   0/1   0/1
4     1   4  .   C   A   30      .    .     GT   1/1   1/1   1/1
5     1   5  .   G   C   30      .    .     GT   1/1   1/1   1/1
``````

Run the `waterson.theta` function:

``````waterson.theta(vcf.table, perBP=FALSE)
[1] 0.8759124
``````