Created
December 13, 2018 15:07
-
-
Save boopsboops/a1c790064fe0a14af5226d098645ca60 to your computer and use it in GitHub Desktop.
Format and subset a tabular reference library to fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# script to subset and convert a reference library into fasta format | |
# load up libraries | |
library("tidyverse") | |
library("magrittr") | |
library("ape") | |
# load up the references using the `references-load.R` script | |
# loads objects: uk.species.table, uk.species.table.common, reflib.orig | |
source("https://raw.githubusercontent.com/boopsboops/reference-libraries/master/scripts/references-load.R") | |
source("https://raw.githubusercontent.com/legalLab/protocols-scripts/master/scripts/tab2fas.R") | |
# take a look at available primer sets in the dataset | |
sets <- grep("nucleotidesFrag.+\\.noprimers", names(reflib.orig), value=TRUE) | |
print(str_replace_all(sets, "nucleotidesFrag\\.|\\.noprimers","")) | |
# choose a set, in this case set five, Miya MiFish 12S. | |
myset <- sets[5] | |
# subset the master dataframe for records that have data for that primer set | |
reflib.sub <- reflib.orig %>% filter(!is.na(!!as.name(myset))) | |
# replace the master nucleotides and length columns with the one selected | |
reflib.sub %<>% mutate(nucleotides=!!as.name(myset), length=!!as.name(str_replace_all(myset,"nucleotides","length"))) | |
# tidy and remove all other primer fragment columns | |
reflib.sub %<>% select(-contains("Frag")) | |
# covert and write out a fasta file using `tab2fas` | |
# uses the standard database id field ("dbid") as a label | |
# "dbid" is the GenBank GI number, or the BOLD processID number (may need to remove the trailing markercode to resolve using boldsystems.org) | |
# custom labels can be created with `mutate` using other fields and changing "namecol" argument | |
reflib.fas <- tab2fas(df=reflib.sub, seqcol="nucleotides", namecol="dbid") | |
write.FASTA(reflib.fas, file="references.fasta") | |
# optionally write out a reduced sized tabular format for easy reading | |
write_csv(reflib.sub, path="reduced-references.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment