boopsboops/reflib-format.R

## reflib-format.R
#!/usr/bin/env Rscript
# script to subset and convert a reference library into fasta format

# load up libraries
library("tidyverse")
library("magrittr")
library("ape")

# load up the references using the `references-load.R` script
# loads objects: uk.species.table, uk.species.table.common, reflib.orig
source("https://raw.githubusercontent.com/boopsboops/reference-libraries/master/scripts/references-load.R")
source("https://raw.githubusercontent.com/legalLab/protocols-scripts/master/scripts/tab2fas.R")

# take a look at available primer sets in the dataset
sets <- grep("nucleotidesFrag.+\\.noprimers", names(reflib.orig), value=TRUE)
print(str_replace_all(sets, "nucleotidesFrag\\.|\\.noprimers",""))

# choose a set, in this case set five, Miya MiFish 12S.
myset <- sets[5]

# subset the master dataframe for records that have data for that primer set
reflib.sub <- reflib.orig %>% filter(!is.na(!!as.name(myset)))

# replace the master nucleotides and length columns with the one selected
reflib.sub %<>% mutate(nucleotides=!!as.name(myset), length=!!as.name(str_replace_all(myset,"nucleotides","length")))

# tidy and remove all other primer fragment columns
reflib.sub %<>% select(-contains("Frag"))

# covert and write out a fasta file using `tab2fas`
# uses the standard database id field ("dbid") as a label
# "dbid" is the GenBank GI number, or the BOLD processID number (may need to remove the trailing markercode to resolve using boldsystems.org)
# custom labels can be created with `mutate` using other fields and changing "namecol" argument
reflib.fas <- tab2fas(df=reflib.sub, seqcol="nucleotides", namecol="dbid")
write.FASTA(reflib.fas, file="references.fasta")

# optionally write out a reduced sized tabular format for easy reading
write_csv(reflib.sub, path="reduced-references.csv")
	#!/usr/bin/env Rscript
	# script to subset and convert a reference library into fasta format

	# load up libraries
	library("tidyverse")
	library("magrittr")
	library("ape")

	# load up the references using the `references-load.R` script
	# loads objects: uk.species.table, uk.species.table.common, reflib.orig
	source("https://raw.githubusercontent.com/boopsboops/reference-libraries/master/scripts/references-load.R")
	source("https://raw.githubusercontent.com/legalLab/protocols-scripts/master/scripts/tab2fas.R")

	# take a look at available primer sets in the dataset
	sets <- grep("nucleotidesFrag.+\\.noprimers", names(reflib.orig), value=TRUE)
	print(str_replace_all(sets, "nucleotidesFrag\\.\|\\.noprimers",""))

	# choose a set, in this case set five, Miya MiFish 12S.
	myset <- sets[5]

	# subset the master dataframe for records that have data for that primer set
	reflib.sub <- reflib.orig %>% filter(!is.na(!!as.name(myset)))

	# replace the master nucleotides and length columns with the one selected
	reflib.sub %<>% mutate(nucleotides=!!as.name(myset), length=!!as.name(str_replace_all(myset,"nucleotides","length")))

	# tidy and remove all other primer fragment columns
	reflib.sub %<>% select(-contains("Frag"))

	# covert and write out a fasta file using `tab2fas`
	# uses the standard database id field ("dbid") as a label
	# "dbid" is the GenBank GI number, or the BOLD processID number (may need to remove the trailing markercode to resolve using boldsystems.org)
	# custom labels can be created with `mutate` using other fields and changing "namecol" argument
	reflib.fas <- tab2fas(df=reflib.sub, seqcol="nucleotides", namecol="dbid")
	write.FASTA(reflib.fas, file="references.fasta")

	# optionally write out a reduced sized tabular format for easy reading
	write_csv(reflib.sub, path="reduced-references.csv")