Skip to content

Instantly share code, notes, and snippets.

View fstrozzi's full-sized avatar

Francesco Strozzi fstrozzi

View GitHub Profile
Program call:
uniref90_subsample.fasta uniref90 mmseqs2_results.m8 tmp -s 4 --max-seqs 4000
MMseqs Version: be8f616d7cfd406880608e470a6c7fda45b04c50
Sub Matrix blosum62.out
Add backtrace false
Alignment mode 3
E-value threshold 0.001
Seq. Id Threshold 0
Coverage threshold 0
@fstrozzi
fstrozzi / Samples.txt
Created February 24, 2016 15:59
Samples.txt file example for coverage_stats.r
files name
Sample_WB-103.cov.hist.txt WB_103
Sample_WB-104.cov.hist.txt WB_104
Sample_WB-105.cov.hist.txt WB_105
Sample_WB-106.cov.hist.txt WB_106
Sample_WB-107.cov.hist.txt WB_107
Sample_WB-108.cov.hist.txt WB_108
Sample_WB-109.cov.hist.txt WB_109
Sample_WB-180.cov.hist.txt WB_180
Sample_WB-181.cov.hist.txt WB_181
@fstrozzi
fstrozzi / coverage_stats.r
Created February 24, 2016 15:58
Script to calculate coverage of target regions for exome sequencing
library(RColorBrewer)
rm(list=ls())
samples = read.table("Samples.txt",header=T,stringsAsFactors=F)
coverage = list()
cumulative_coverage = list()
for (i in 1:length(samples$files)) {
coverage[[i]] <- read.table(samples$files[i])
cumulative_coverage[[i]] <- 1-cumsum(coverage[[i]][,5])
}
@fstrozzi
fstrozzi / filter_vcf.py
Created February 10, 2016 11:57
PyVCF example using Cython and BGZIP compressed output
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import cyvcf
import Bio.bgzf
vcf_input = sys.argv[1]
@fstrozzi
fstrozzi / fastq_reader.scala
Created December 15, 2015 11:15
Example Scala reading FastQ
import net.sf.picard.fastq.FastqReader
import java.io.File
import scala.collection.JavaConversions._
object Reader extends App {
val file = new File(args(0))
val fr = new FastqReader(file)
printToFile(new File(args(1)))(p => {
for (seq <- fr.iterator()) {
val header = seq.getReadHeader.split(" ")
@fstrozzi
fstrozzi / biomaRt.r
Last active December 14, 2015 14:21
Example BiomaRt with R
library(biomaRt)
## ensembl=useMart("ensembl")
## listDatasets(ensembl) # show all the possible databases on Ensembl
ensembl = useEnsembl(biomart="ensembl",dataset="btaurus_gene_ensembl")
## listAttributes(ensembl) # show the attributes of the database
@fstrozzi
fstrozzi / gist:a6dd0daba816ea2fbd5d
Last active August 29, 2015 14:10
Using HTSJDK from JRuby to parse a BAM file and filter for number of mismatches per read
#!/usr/bin/env ruby
require "htsjdk-1.125.jar" # https://github.com/broadinstitute/picard/releases/download/1.125/picard-tools-1.125.zip
require "logger"
java_import "htsjdk.samtools.SAMFileReader"
java_import "htsjdk.samtools.SAMFileWriterFactory"
logger = Logger.new(STDOUT)
desc "quant GTF OUTPUTDIR BAM ", "Genes and transcripts quantification"
Bio::Ngs::Cufflinks::Quantification.new.thor_task(self, :quant) do |wrapper, task, gtf, outputdir, bam|
wrapper.params = task.options
wrapper.params = {"num-threads" => 6, "output-dir" => outputdir, "GTF" => gtf }
wrapper.run :arguments=>[bam], :separator => "="
end
class Trim
include Bio::Command::Wrapper
set_program Bio::Ngs::Utils.binary("fastx_trimmer")
use_aliases
add_option :first_base, :type => :numeric, :aliases => "-f", :desc => "First base to keep"
add_option :last_base, :type => :numeric, :aliases => "-l", :desc => "Last base to keep"
add_option :compress, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP"
add_option :input, :type => :string, :aliases => "-i", :desc => "Input FASTA/Q file", :collapse => true
add_option :output, :type => :string, :aliases => "-o", :desc => "Output FASTA/Q file", :collapse => true
add_option :trim, :type => :numeric, :aliases => "-t", :desc => "Trim N nucleotides from the end of the read"
def thor_task(klass, task_name, &block)
if program.nil?
warn "WARNING: no program is associated with #{class_name.upcase} task, does not make sense to create a thor task."
return nil
end
if klass
wrapper = self
klass.class_eval do
wrapper.options.each_pair do |name, opt|
method_option name, opt