Daniel E Cook danielecook

## gtf2gffpipe.py
#!/usr/bin/python
# Slightly adapted from http://blog.nextgenetics.net/?e=27

import sys

def main():
    for line in sys.stdin.xreadlines():
      #skip comment lines that start with the '#' character
      if line[0] != '#':
        #split line into columns by tab

## sra_to_fastq.sh
# Download the .sra file of every run returned by an SRA query.
#   $1 - query handed to `esearch -db sra` (typically an SRP study accession)
# Each run is fetched with wget into the current directory.
Download_SRP_Runs() {
    # Quote the query so terms containing spaces or glob characters reach esearch intact.
    SRP_IDs=$(esearch -db sra -query "$1" | efetch -format docsum | xtract -pattern DocumentSummary -element Run@acc | tr '\t' '\n')
    # Word splitting is intentional here: one run accession per word.
    for r in ${SRP_IDs}; do
        # Path layout: <3-letter prefix>/<first 6 chars>/<accession>/<accession>.sra
        # Derive the prefix from the accession so ERR/DRR runs resolve as well as SRR.
        url="ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/${r:0:3}/${r:0:6}/${r}/${r}.sra"
        wget "$url"
    done
}

Download_SRP_Runs <SRP ID GOES HERE>

## wormbase_245_gff_to_vcf.py
import gzip
import os
import re

# Source of the WormBase annotation file.
GFF_URL = "ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz"
# Release tag (e.g. "WS245") parsed out of the URL; used to name the local copy.
BUILD = re.search("WS[0-9]+", GFF_URL).group(0)
GFF_FILE = "c_elegans.{BUILD}.annotations.gff3.gz".format(BUILD=BUILD)

# Download the annotation file only if it is not already in the working directory.
if not os.path.isfile(GFF_FILE):
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("Downloading Annotation File")
    # Build the command from GFF_URL so the URL is defined in exactly one place.
    os.system("curl '{url}' > {out}".format(url=GFF_URL, out=GFF_FILE))

acceptable_types = ['SNP', 'point_mutation']

## rename_bcf_samples.sh
function rename_to_filename {
    # Renames samples with the filename: the name written for bcftools is $1 with
    # its first ".vcf"/".bcf" removed. The file is rewritten in place and re-indexed.
    #   $1 - VCF/BCF file to reheader
    # Returns non-zero if any step fails; the original file is left untouched
    # when `bcftools reheader` itself fails.
    local tmp out status=0
    # Explicit XXXXXX template: GNU mktemp rejects a -t template without X's.
    tmp=$(mktemp -t temp.XXXXXX) || return 1
    # Keep the intermediate file next to the input even when $1 includes a directory.
    out="$(dirname "$1")/m.$(basename "$1")"
    echo "${1/.[vb]cf/}" > "$tmp"
    if bcftools reheader -s "$tmp" "$1" > "$out"; then
        mv "$out" "$1" && bcftools index "$1" || status=$?
    else
        status=$?
        # Do not clobber the input with a partial or empty result.
        rm -f "$out"
    fi
    rm -f "$tmp"
    return $status
}

function add_sample_prefix {

## vcf_downgrade.sh
# If you are trying to view VCF 4.2 files in IGV - you may run into issues. This function might help you.
# This script will:
# 1. Rename the file as version 4.1
# 2. Replace parentheses in the INFO lines (IGV doesn't like these!)

function vcf_downgrade() {
  outfile=${1/.bcf/}
  outfile=${outfile/.gz/}
  outfile=${outfile/.vcf/}
  bcftools view --max-alleles 2 -O v $1 | \

## WS245 annotations.sh
# Download wormbase gff file
curl 'ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz' > c_elegans.WS245.annotations.gff3.gz

# Use gff parallelized tools:  brew install dmd
# Extract each type into its own GFF File

# This list obtained by running:
# gunzip -kfc c_elegans.WS245.annotations.gff3.gz | cut -f 3 | sort | uniq
types="CDS
DNAseI_hypersensitive_site

## slurm_last_job.py
def SLURM_get_last_job():
    """Return the highest job ID in the current user's SLURM queue.

    Runs ``squeue`` for ``$USER`` printing only job IDs (``-o %i``) without a
    header (``-h``), sorted by job ID in descending order (``-S -i``), and
    keeps the first line of that output.

    Returns:
        The job ID as a text string, or "" when the command prints nothing.
    """
    # Imported locally so the function also works when pasted on its own.
    from subprocess import PIPE, Popen

    # universal_newlines=True makes communicate() return text instead of bytes
    # on Python 3, so an empty result really is "" (and not b"").
    jobid = Popen(
        "squeue --user=$USER -o %i -h -S -i | head -n 1",
        stdout=PIPE,
        shell=True,
        universal_newlines=True,
    ).communicate()[0]
    return jobid.strip()

## chunk_genome.py
def chunk_genome(chunk_size, reference):
    """
    Parses bwa .ann file to retrieve chromosome sizes
    for chunking purposes
    """
    ann = open(reference + ".ann").read()
    # Parsing .ann files
    contigs = [x.split(" ")[1] for x in ann.split("\n")[1:-1:1]][::2]
    contig_sizes = map(int,[x.split(" ")[1] for x in ann.split("\n")[1:-1:1]][1::2])
    for chrom, size in zip(contigs, contig_sizes):

## FastQC_aggregate.sh
# Run this script in a directory containing zip files from FastQC. It aggregates the images of each type
# into individual folders, so that looking across the data is quick.

# Collect the names of all .zip archives in the current directory.
zips=`ls *.zip`

# Extract every archive; -o overwrites existing files without prompting,
# and all unzip output (stdout and stderr) is discarded.
for i in $zips; do
    unzip -o $i &>/dev/null;
done

# Presumably the folder names the archives extract to (archive name minus ".zip").
# NOTE(review): ${zips/.zip/} removes only the FIRST ".zip" in the whole list, so with
# several archives the later entries keep their extension - confirm that the code
# consuming fastq_folders accounts for this.
fastq_folders=${zips/.zip/}

## process_gff.py
import sys

# Split a GFF stream read from stdin into one "<feature>.gff" file per feature
# type (third tab-separated column), appending each record to its type's file.
current_feature = None
f = None

try:
    for line in sys.stdin:
        columns = line.split("\t")
        # Header/comment lines (e.g. "##gff-version 3") and malformed records
        # have no feature column; skip them instead of raising IndexError.
        if line.startswith("#") or len(columns) < 3:
            continue
        feature = columns[2]
        if feature != current_feature:
            # Feature type changed: close the previous output before switching,
            # and remember the new type so the file is not reopened per line.
            if f is not None:
                f.close()
            f = open(feature + ".gff", "a")
            current_feature = feature
        f.write(line)
finally:
    if f is not None:
        f.close()
	#!/usr/bin/python
	# Slightly adapted from http://blog.nextgenetics.net/?e=27

	import sys

	def main():
	for line in sys.stdin.xreadlines():
	#skip comment lines that start with the '#' character
	if line[0] != '#':
	#split line into columns by tab
	Download_SRP_Runs() {
	SRP_IDs=`esearch -db sra -query $1 \| efetch -format docsum \| xtract -pattern DocumentSummary -element Run@acc \| tr '\t' '\n'`
	for r in ${SRP_IDs}; do
	url="ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/${r:0:6}/${r}/${r}.sra"
	wget $url
	done;
	}

	Download_SRP_Runs <SRP ID GOES HERE>
	import os, gzip

	GFF_URL = "ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz"
	BUILD = re.search("WS[0-9]+",GFF_URL).group(0)

	if not os.path.isfile("c_elegans.{BUILD}.annotations.gff3.gz".format(BUILD=BUILD)):
	print "Downloading Annotation File"
	os.system("curl 'ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz' > c_elegans.{BUILD}.annotations.gff3.gz".format(BUILD=BUILD))

	acceptable_types = ['SNP', 'point_mutation']
	function rename_to_filename {
	# Renames samples with the filename.
	tmp=`mktemp -t temp`
	echo ${1/.[vb]cf/} > $tmp
	bcftools reheader -s $tmp $1 > m.$1
	mv m.$1 $1
	bcftools index $1
	}

	function add_sample_prefix {
	# If you are trying to view VCF 4.2 files in IGV - you may run into issues. This function might help you.
	# This script will:
	# 1. Rename the file as version 4.1
	# 2. Replace parentheses in the INFO lines (IGV doesn't like these!)

	function vcf_downgrade() {
	outfile=${1/.bcf/}
	outfile=${outfile/.gz/}
	outfile=${outfile/.vcf/}
	bcftools view --max-alleles 2 -O v $1 \| \
	# Download wormbase gff file
	curl 'ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz' > c_elegans.WS245.annotations.gff3.gz

	# Use gff parallelized tools: brew install dmd
	# Extract each type into its own GFF File

	# This list obtained by running:
	# gunzip -kfc c_elegans.WS245.annotations.gff3.gz \| cut -f 3 \| sort \| uniq
	types="CDS
	DNAseI_hypersensitive_site
	def SLURM_get_last_job():
	jobid = Popen("squeue --user=$USER -o %i -h -S -i \| head -n 1", stdout=PIPE, shell=True).communicate()[0]
	jobid = jobid.strip()
	if jobid == "":
	return ""
	else:
	return jobid
	def chunk_genome(chunk_size, reference):
	"""
	Parses bwa .ann file to retrieve chromosome sizes
	for chunking purposes
	"""
	ann = open(reference + ".ann").read()
	# Parsing .ann files
	contigs = [x.split(" ")[1] for x in ann.split("\n")[1:-1:1]][::2]
	contig_sizes = map(int,[x.split(" ")[1] for x in ann.split("\n")[1:-1:1]][1::2])
	for chrom, size in zip(contigs, contig_sizes):
	# Run this script in a directory containing zip files from fastqc. It aggregates images of each type in individual folders
	# So looking across data is quick.

	zips=`ls *.zip`

	for i in $zips; do
	unzip -o $i &>/dev/null;
	done

	fastq_folders=${zips/.zip/}
	import sys

	current_feature = ""

	for line in sys.stdin:
	feature = line.split("\t")[2]
	if feature != current_feature:
	f = file(feature + ".gff", "a+")
	f.write(line)