Skip to content

Instantly share code, notes, and snippets.

View dnanto's full-sized avatar
⚗️
🝎

Daniel Antonio Negrón dnanto

⚗️
🝎
View GitHub Profile
@dnanto
dnanto / taxsumm.sh
Created May 16, 2019 01:51
download a sequence summary for a given NCBI Taxonomy identifier using EDirect
#!/usr/bin/env bash
# install edirect utils from here: ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/
taxid="$1"
db="${2:-nuccore}"
printf "extra\ttaxid\tslen\ttitle\n"
esearch \
-db "$db" \
@dnanto
dnanto / outfmt7.R
Last active May 17, 2019 20:27
R functions to read/parse BLAST+ output format mode 7, automatically extracting fields and metadata with support for multiple queries (uses tidyverse)
parse_outfmt7 <- function(lines)
{
comments <- lines[grep("#", lines)]
ver <- comments[grep("^# BLASTN", comments)] %>% str_split_fixed(" ", 2) %>% .[ , 2]
db <- comments[grep("^# Database: ", comments)] %>% str_split_fixed(" ", 3) %>% .[ , 3]
fields <- comments[grep("^# Fields: ", comments)] %>% str_split_fixed(" ", 3) %>% .[ , 3] %>% str_split(", ", simplify = T)
read_tsv(lines, comment = "#", col_names = fields) %>% mutate(ver = ver, db = db)
}
read_outfmt7 <- function(path)
@dnanto
dnanto / terminal_histogram.sh
Created August 12, 2019 21:57
plot a histogram on the terminal using data from stdin
#!/usr/bin/env bash
# based on: https://stackoverflow.com/a/2538846
for _ in $(seq 1 100); do echo $(( $RANDOM % 100 )); done | \
gnuplot -p -e "set terminal dumb; W=2; bin(x, w)=w*floor(x/w); plot '< cat -' using (bin(\$1, W)):(1.0) smooth freq with boxes"
@dnanto
dnanto / outfmt7.py
Last active September 8, 2019 18:25
python3 generator to parse BLAST+ output format mode 7, automatically extracting fields
from collections import OrderedDict
def parse_outfmt7(file):
fields = []
for line in map(str.strip, file):
if line.startswith("# Fields: "):
fields = line[10:].split(", ")
elif line and not line.startswith("#"):
yield OrderedDict(zip(fields, line.split("\t")))
@dnanto
dnanto / read_dist.R
Last active September 15, 2019 16:29
R function to read lower-triangle distance matrix where the first row is the number of objects and the first column correspnds to the names (uses tidyverse)
read_dist <- function(path)
{
lines <- read_lines(path)
n <- as.integer(lines[1])
str_split_fixed(tail(lines, -1), "\t", n) %>%
as.data.frame() %>%
column_to_rownames("V1") %>%
add_column(" ") %>%
set_names(rownames(.))
}
@dnanto
dnanto / btop.py
Last active September 21, 2019 16:35
decode mutations from BTOP string in python
import sys
# https://www.ncbi.nlm.nih.gov/books/NBK279682/#_cookbook_Traceback_operations_BTOP_
# BTOP operations consist of
# 1.) a number with a count of matching letters,
# 2.) two letters showing a mismatch (e.g., “AG” means A was replaced by G), or
# 3.) a dash (“-“) and a letter showing a gap.
def decode_btop(btop):
pos, digis, chars = 0, [], []
@dnanto
dnanto / btop.R
Created September 21, 2019 16:35
decode mutations from BTOP string in R (uses tidyverse)
decode_btop <- function(btop)
{
matches <- as.integer(str_extract_all(btop, "\\d+", simplify = T))
matches <- if (str_starts(btop, "\\d", negate = T)) c(0, matches) else matches
pos <- 0; n <- 0; m <- 0; muts <- list();
for (ele in Filter(nchar, str_split(btop, "\\d+", simplify = T)))
{
pos <- pos + matches[n<-n+1]
for (i in seq(1, nchar(ele), 2)) muts[[m<-m+1]] <- list(pos = pos<-pos+1, mut = substr(ele, i, i+1))
}
@dnanto
dnanto / rbtop.R
Created December 31, 2019 22:07
calculate the reverse complement of a BTOP string (uses tidyverse)
comp <- set_names(
as.list(str_split("TAACGRYSWMKVHDBN", "", simplify = 1)),
str_split("ATUGCYRSWKMBDHVN", "", simplify = T)
)
revcomp_btop <- function(btop)
{
str_replace_all(btop, "([A-Z-])([A-Z-])", " \\2 \\1 ") %>%
str_split(" ", simplify = T) %>%
rev() %>%
@dnanto
dnanto / lsystem.py
Created January 10, 2020 05:41
L-system string generator in python...
#!/usr/bin/env python3
import sys
axiom = "0"
rules = { "0": "1[0]0", "1": "11" }
for i in range(int(sys.argv[1])):
axiom = "".join(rules.get(e, e) for e in axiom)
print(axiom)
@dnanto
dnanto / snp-sites-counts.R
Last active March 9, 2020 17:55
snp-sites VCF to SNP counts...
# mafft ncov.fna > msa.fna
# snp-sites -c -v msa.fna > snp.vcf
library(tidyverse)
lines <- read_lines("snp.vcf")
fields <- str_split(last(lines[startsWith(lines, "#")]), "\t")[[1]]
read_tsv(lines, comment = "#", col_names = fields, col_types = cols(.default = "c")) %>%
select(10:ncol(.)) %>%
mutate_all(as.integer) %>%
colSums() %>%
sort(decreasing = T) %>%