-
-
Save dnanto/c14cf8b15b547e9178dd39f5fa3b3dfe to your computer and use it in GitHub Desktop.
R function to read the NCBI BankIt feature table file format using the tidyverse packages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Read an NCBI BankIt feature table file
#'
#' Parses the tab-delimited, five-column feature table format into one pair of
#' tibbles per feature accession.
#'
#' Rows are interpreted as follows:
#' * a row whose first field starts with `>` opens a new accession;
#' * a row with a third field (the feature name) opens a new feature;
#' * a row with only the first two fields adds an interval to the current
#'   feature;
#' * a row with the fourth and fifth fields is a qualifier key/value pair of
#'   the current feature.
#'
#' @param file Path or connection accepted by `readr::read_tsv()`.
#'
#' @return A named list with one element per accession. Names are the
#'   accession header fields exactly as they appear in the file (including
#'   the leading `>`), ordered as `split()` orders them. Each element is a
#'   list of two tibbles:
#'   * `locus`: `id`, `name`, `start`, `end`, `p5`, `p3` -- one row per
#'     interval. `start`/`end` are integers with the `<`/`>` markers removed;
#'     `p5` is `TRUE` when the start carried a `<` marker and `p3` is `TRUE`
#'     when the end carried a `>` marker.
#'   * `qual`: `id`, `key`, `val` -- one row per qualifier.
#'
#'   `id` is a feature counter that runs across the whole file and links
#'   `qual` rows to their `locus` rows.
read_ft <- function(file) {
  # Rows are ragged (1, 2, 3 or 5 fields), so readr warns about every short
  # row; that is expected for this format and is silenced on purpose.
  raw <- suppressWarnings(
    readr::read_tsv(
      file,
      col_names = c("start", "end", "name", "key", "val"),
      col_types = readr::cols(.default = "c")
    )
  )

  # Accession header rows; computed once and reused to tag and to drop them.
  is_header <- !is.na(raw$start) & startsWith(raw$start, ">")

  # Associate every row with its accession and a running feature number.
  tagged <- dplyr::mutate(
    raw,
    acc = dplyr::if_else(is_header, start, NA_character_),
    id = cumsum(!is.na(name))
  )
  tagged <- tidyr::fill(tagged, acc, name, .direction = "down")

  # Header rows have served their purpose once `acc` is filled down.
  features <- dplyr::filter(tagged, !is_header)

  # Record the partial markers before stripping them from the coordinates.
  features <- dplyr::mutate(
    features,
    p5 = startsWith(start, "<"),
    p3 = startsWith(end, ">"),
    start = as.integer(stringr::str_remove(start, "<")),
    end = as.integer(stringr::str_remove(end, ">"))
  )

  # One list element per accession; rows with no accession are dropped by
  # split() because their grouping value is NA.
  lapply(split(features, features$acc), function(ele) {
    # Interval rows are the ones with coordinates; p5/p3 are kept here so the
    # partial information computed above reaches the caller.
    locus <- dplyr::select(ele, id, name, start, end, p5, p3)
    # Qualifier rows are the ones with a key and a value.
    qual <- dplyr::select(ele, id, key, val)
    list(
      locus = dplyr::filter(locus, stats::complete.cases(locus)),
      qual = dplyr::filter(qual, stats::complete.cases(qual))
    )
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment