Joseph Hughes josephhughes

## README
ReplaceStopsWithGaps.pl is a perlscript written by Joseph Hughes, University of Glasgow

use this to remove stop codons from an alignment
typically, this would be done to calculate dN/dS in HYPHY
Usage:
perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta

Use this to replace stop codons in the nucleotide alignment with gaps.
Both the nucleotide and the peptide alignments are required.

## README
SplitFastq.pl is a Perl script written by Joseph Hughes, University of Glasgow

Usage:
  perl SplitFastq.pl -in ourmultifastqfile

This script will split a file containing multiple fastq records into separate fastq files named according to the fastq ID.
The script uses Bioperl.

## Clusters.txt
#Name	Seg1	Seg2	Seg3	Seg4	Seg5	Seg6	Seg7	Seg8	Seg9	Seg10
1-8FRA2008-27	15	4	16	1	11	3	1	19	1	14
1-8FRA2008-28	15	4	16	1	11	3	1	19	1	14
1-8FRA2008-29	15	4	16	1	11	3	1	19	1	14
10RSArrrr-10		?				?
11RSArrrr-11		15				6
12RSArrrr-12		?				?
13RSArrrr-13		?				?
14CAR1982-04	?	?	1	1	1	?	?	2	1	1
14POL2012-01	3	14	15	2	2	13	5	7	4	14

## parse_cdhit.pl
# use this to get the number of reads in each cluster
use strict;
use Getopt::Long;
use Bio::SeqIO;

# Script-wide variables. Only $clstr and $result are filled in by the
# command-line parsing below; $long, %clusters and $infile are declared here
# for use elsewhere in the script.
my ($clstr,$result,$long,%clusters,$infile);

# Parse the command line. GetOptions returns false when it meets an invalid
# option, so stop with a usage hint instead of continuing with half-parsed
# settings. Called without the legacy '&' sigil.
GetOptions(
    'clstr:s' => \$clstr,     # a cd-hit generated cluster file
    'out:s'   => \$result,    # a text file with the numbers of reads in each cluster
) or die "Usage: perl $0 -clstr <cd-hit cluster file> -out <output text file>\n";

## dna2FreqAndDistMat
require(ape)

dna2FreqAndDistMat<-function(dna,model=NULL){
  if(is.null(model)){ model <- c("raw")}
   # model must be one of allowed_models; "raw" is the default model
  allowed_models<-c("raw", "N", "TS", "TV", "JC69", "K80", "F81", "K81", "F84", "BH87", "T92", "TN93", "GG95", "logdet", "paralin", "indel", "indelblock")
  if(!any(allowed_models==model)){
    warning("You need to provide the correct model: raw, N, TS, TV, JC69, K80, F81, K81, F84, BH87, T92, TN93, GG95, logdet, paralin, indel, indelblock")
    return(NULL)
  }

## ReplaceStopWithRefCodonGaps.pl
#!/usr/bin/perl -w
#
# use this to remove stop codons from an alignment
# typically, this would be done to calculate dN/dS in HYPHY
# Usage: perl ../Scripts/ReplaceStopWithRefCodonGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta -ref 104D5S1
# use this to replace stop codons from the nucleotide alignment with the codon of the reference
# the nucleotide and the peptide alignments are necessary and the name of the reference sequence
# the reference sequence needs to be in the nucleotide alignment


## alignScript.sh
#!/bin/bash
# ./alignScript.sh ref pair1 pair2 name
#
# Index a reference with bwa, align two input files against it with
# `bwa mem`, then sort and index the resulting alignment with samtools.
#
# Arguments:
#   ref    reference file given to `bwa index` and `bwa mem`
#   pair1  first input file given to `bwa mem` after the reference
#   pair2  second input file given to `bwa mem` after the reference
#   name   prefix for the output files: <name>.sam and <name>.bam

# Each step consumes the output of the previous one, so stop at the first
# failure (and on any unset variable) rather than running later steps on
# missing or partial files.
set -eu

if [ "$#" -lt 4 ]; then
    echo "Usage: $0 ref pair1 pair2 name" >&2
    exit 1
fi

ref_name=$1
pair1=$2
pair2=$3
name=$4

# All expansions are quoted so that paths containing spaces or glob
# characters are passed through as single arguments.
bwa index "$ref_name"
bwa mem "$ref_name" "$pair1" "$pair2" > "${name}.sam"
samtools sort "${name}.sam" -o "${name}.bam"
samtools index "${name}.bam"

## pango_designation2json.py
import json
import argparse
import csv
import sys
# provide as input
# 1) the curation notes (tsv) (more extensive than lineage_notes.txt, which only has lineage and description)
#  contains: Lineage  Rough number of SNPs  Example sequence  Active/ Unobserved/ Inactive  Designator  Size (roughly)  Description
# 2) full_alias_key.txt a file with the renames for the aliases (.txt): alias,lineage

# to do:

## RetrieveEmailFromPubmed
#!/usr/bin/perl -w
# A perlscript written by Joseph Hughes, University of Glasgow
# use this perl script to parse the email addresses from the affiliations in PubMed

use strict;
use LWP::Simple;

# PubMed search term: Journal of Virology, publication dates from 2014
# onwards (3000 is used as the upper bound to mean "until the present").
my $query = 'journal+of+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';
# Declared alongside $query; it is not filled in by these lines.
my @queries;
	ReplaceStopsWithGaps.pl is a perlscript written by Joseph Hughes, University of Glasgow

	use this to remove stop codons from an alignment
	typically, this would be done to calculate dN/dS in HYPHY
	Usage:
	perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta

	use this to replace stop codons from the nucleotide alignment
	the nucleotide and the peptide alignments are necessary
	SplitFastq.pl is a perlscript written by Joseph Hughes, university of Glasgow

	Usage:
	perl SplitFastq.pl -in ourmultifastqfile

	This script will split a file containing multiple fastq records into separate fastq files named according to the fastq ID.
	The script uses Bioperl.
	#Name Seg1 Seg2 Seg3 Seg4 Seg5 Seg6 Seg7 Seg8 Seg9 Seg10
	1-8FRA2008-27 15 4 16 1 11 3 1 19 1 14
	1-8FRA2008-28 15 4 16 1 11 3 1 19 1 14
	1-8FRA2008-29 15 4 16 1 11 3 1 19 1 14
	10RSArrrr-10 ? ?
	11RSArrrr-11 15 6
	12RSArrrr-12 ? ?
	13RSArrrr-13 ? ?
	14CAR1982-04 ? ? 1 1 1 ? ? 2 1 1
	14POL2012-01 3 14 15 2 2 13 5 7 4 14
	# use this to get the number of reads in each cluster
	use strict;
	use Getopt::Long;
	use Bio::SeqIO;

	my ($clstr,$result,$long,%clusters,$infile);
	&GetOptions(
	'clstr:s' =>\$clstr, #a cd-hit generated cluster file
	'out:s' => \$result, # a text file with the numbers of reads in each cluster
	);
	require(ape)

	dna2FreqAndDistMat<-function(dna,model=NULL){
	if(is.null(model)){ model <- c("raw")}
	#model must be one , "raw" is the default model
	allowed_models<-c("raw", "N", "TS", "TV", "JC69", "K80", "F81", "K81", "F84", "BH87", "T92", "TN93", "GG95", "logdet", "paralin", "indel", "indelblock")
	if(!any(allowed_models==model)){
	warning("You need to provide the correct model: raw, N, TS, TV, JC69, K80, F81, K81, F84, BH87, T92, TN93, GG95, logdet, paralin, indel, indelblock")
	return(NULL)
	}
	#!/usr/bin/perl -w
	#
	# use this to remove stop codons from an alignment
	# typically, this would be done to calculate dN/dS in HYPHY
	# Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta -ref 104D5S1
	# use this to replace stop codons from the nucleotide alignment with the codon of the reference
	# the nucleotide and the peptide alignments are necessary and the name of the reference sequence
	# the reference sequence needs to be in the nucleotide alignment
	#!/bin/bash
	# ./alignScript.sh ref pair1 pair2 name
	ref_name=$1
	pair1=$2
	pair2=$3
	name=$4
	bwa index $ref_name
	bwa mem $ref_name $pair1 $pair2 > ${name}.sam
	samtools sort ${name}.sam -o ${name}.bam
	samtools index ${name}.bam
	import json
	import argparse
	import csv
	import sys
	# provide as input
	# 1) the curation notes (tsv) (more extensive than lineage_notes.txt, which only has lineage and description)
	# contains: Lineage Rough number of SNPs Example sequence Active/ Unobserved/ Inactive Designator Size (roughly) Description
	# 2) full_alias_key.txt a file with the renames for the aliases (.txt): alias,lineage

	# to do:
	#!/usr/bin/perl -w
	# A perlscript written by Joseph Hughes, University of Glasgow
	# use this perl script to parse the email addresses from the affiliations in PubMed

	use strict;
	use LWP::Simple;

	my ($query,@queries);
	#Query the Journal of Virology from 2014 until the present (use 3000)
	$query = 'journal+of+virology[journal]+AND+2014[Date+-+Publication]:3000[Date+-+Publication]';