Skip to content

Instantly share code, notes, and snippets.

View avrilcoghlan's full-sized avatar

Avril Coghlan avrilcoghlan

View GitHub Profile
@avrilcoghlan
avrilcoghlan / fix_embl_file.py
Created November 28, 2018 09:35
Script to mark some genes in an EMBL file as /pseudo
import sys
import os
from collections import defaultdict
#====================================================================#
# define a function to read in the genes that are in families, for our species of interest:
def find_genes_in_families(families_file, our_species_name, locus_tag):
"""read in the genes that are in families, for our species of interest """
@avrilcoghlan
avrilcoghlan / find_internal_stops.pl
Created November 28, 2018 09:28
Script to find protein sequences with internal stop codons
#!/usr/bin/env perl
=head1 NAME
find_internal_stops.pl
=head1 SYNOPSIS
find_internal_stops.pl input_fasta
where input_fasta is the input fasta file of protein translations.
@avrilcoghlan
avrilcoghlan / submit_crispresso_jobs_for_subsetsoffastq.py
Created October 26, 2018 12:09
Script to run CRISPResso jobs on a farm, for lots of subsets of data
import os
import sys
#====================================================================#
def submit_crispresso_jobs(sample_name, num_subsets):
# need to submit a crispresso job for each subset of the data:
for x in range(num_subsets):
subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17
@avrilcoghlan
avrilcoghlan / filter_fastq_files_using_trimmomatic.py
Created October 26, 2018 12:06
Script to run Trimmomatic to discard read-pairs that have low quality bases
import os
import sys
#====================================================================#
def run_trimmomatic_for_subsets_of_data(sample_name, num_subsets):
# need to run trimmomatic for each subset of the data:
for x in range(num_subsets):
subset = x + 1 # eg. if num_subsets is 17, 'subset' goes from 1 to 17
@avrilcoghlan
avrilcoghlan / split_up_fastq.py
Created October 26, 2018 12:02
Script to split up a gzipped fastq file into smaller gzipped fastq files of 1 million reads each
import sys
import os
import gzip
from collections import defaultdict
#====================================================================#
# now read in the input fastq and split it up:
def read_fastq_file_and_split(input_fastq_file, seqs_per_output_file, output_file_prefix):
@avrilcoghlan
avrilcoghlan / calc_information_content_for_GO_terms.py
Last active April 16, 2018 12:55
Script to calculate the information content for GO terms for a species, given a file with the counts of annotations for each GO term, and the obo file
from collections import defaultdict
import sys
import os
import math
#====================================================================#
# define a function to record the children of each GO term in the GO hierarchy:
def read_go_children(input_go_obo_file):
@avrilcoghlan
avrilcoghlan / comparaFamiliesAnalysis.py
Created April 15, 2018 10:14
Script written by Diogo Ribeiro to identify gene family expansions in an in-house Compara database
#!/usr/bin/env python3
#25-Feb-2015 dr7
#Analysis suite for exploring Compara families. The purpose is to gather information about Compara trees so that we can mine this large set of information and select the most interesting families to study.
import sys
import os
import re
import gzip
import random
import pickle
@avrilcoghlan
avrilcoghlan / submit_crispresso_jobs.py
Created October 27, 2017 09:45
Python script to submit CRISPResso jobs on a compute farm
import sys
import os
from collections import defaultdict
#====================================================================#
# define a function to read in a list of files:
def read_file_list(input_file_list):
#!/usr/local/bin/perl
$fasta = $ARGV[0];
$gff = $ARGV[1];
# read in the gff file to find which sequences to take:
%TAKE = ();
$num_to_take = 0;
open(GFF,"$gff");
while(<GFF>)
@avrilcoghlan
avrilcoghlan / find_lca_of_go_terms.py
Created January 31, 2014 17:03
Python script to find last common ancestors of GO terms
import sys
import os
from collections import defaultdict
import calc_dists_to_top_of_GO
import calc_dists_to_top_of_GO_using_bfs
class Error(Exception):
    """Script-specific exception type; subclasses Exception and adds no behaviour."""
#====================================================================#