Skip to content

Instantly share code, notes, and snippets.

View jasonsahl's full-sized avatar

Jason Sahl jasonsahl

  • Northern Arizona University
View GitHub Profile
@jasonsahl
jasonsahl / mashPy.py
Last active February 17, 2022 15:57
Create a distance dendrogram from MASH distances
#!/usr/bin/env python
#A python implementation of building clusters from MASH distances
import optparse
import sys
from optparse import OptionParser
try:
from scipy.cluster.hierarchy import weighted
import scipy.spatial.distance as ssd
@jasonsahl
jasonsahl / transpose_matrix.py
Created March 29, 2021 16:19
transpose a BSR matrix
#!/usr/bin/env python
import sys
from sys import argv
try:
out_matrix = open("transposed_matrix.matrix", "w")
reduced = []
with open(argv[1]) as my_file:
for line in my_file:
@jasonsahl
jasonsahl / parse_kmer_frequencies.py
Last active March 5, 2020 18:13
Parse Kmer frequencies from a Kmer matrix
#!/usr/bin/env python
"""parse frequencies from a Kmer matrix"""
from __future__ import division
import sys
import os
import optparse
from optparse import OptionParser
from collections import deque
@jasonsahl
jasonsahl / extract_PI_SNPs.py
Last active June 11, 2019 00:24
count and extract parsimony informative SNPs from a multi-fasta
#!/usr/bin/env python
"""retrieve only parsimony infomative
sites from a nucleotide multiple sequence alignment"""
from optparse import OptionParser
import sys
try:
from Bio import SeqIO
except:
@jasonsahl
jasonsahl / mlst_blast_assemblies.py
Last active November 1, 2018 18:02
Uses BLAST to perform MLST typing on genome assemblies
#!/usr/bin/env python
"""calculates MLST types from assemblies using BLAST.
If the gene is truncated, it will report a "T" and if
the gene has no blast hit, it will report a "X".
Your reference allele names must all end in "fasta"
and must contain a "_" between gene name and number.
The only external dependency is blast+ - tested version
is 2.2.31"""
@jasonsahl
jasonsahl / kallisto_wrapper.py
Created December 1, 2017 17:48
Kallisto read count wrapper
#!/usr/bin/env python
"""Read counts across a set of reference sequences.
Requires Python 2.7 to run"""
from __future__ import division
from __future__ import print_function
from optparse import OptionParser
import sys
import os
ERR319438
ERR360775
ERR360792
ERR360848
ERR360746
ERR360782
ERR360788
ERR360789
ERR360783
ERR360770
@jasonsahl
jasonsahl / transform_kallisto_bacseq.py
Created July 17, 2017 19:37
Transform the content of a Kallisto matrix; used for in silico ribotyping of C. difficile in this case
#!/usr/bin/env python
"""Transform ribotype data. Input matrix
is a transposed output from bac_seq"""
from __future__ import print_function
from __future__ import division
import sys
import os
import optparse
@jasonsahl
jasonsahl / sum_seq_length.py
Created August 11, 2016 17:17
Calculates all bases in a multi-FASTA file
#!/usr/bin/python
#parses sequence lengths from a file and prints them to the screen
#usage python seqlength.py infasta
from __future__ import print_function
from sys import argv
import sys
try:
from Bio import SeqIO
except:
print("script requires BioPython to run..exiting")
@jasonsahl
jasonsahl / find_outliers.r
Last active April 27, 2016 23:28
plot outliers in X and Y data
#The input is two columns: x and corresponding y values
require(MASS) ## for mvrnorm()
set.seed(1)
mine <- read.table("xy.txt")
mine <- data.frame(mine)
names(mine) <- c("X","Y")
plot(mine)
res <- resid(mod <- lm(Y ~ X, data = mine))
res.qt <- quantile(res, probs = c(0.001,0.999))
want <- which(res >= res.qt[1] & res <= res.qt[2])