Jason Gallant jasongallant

## extractsequences.py
"""
%prog some.fasta wanted-list.txt
"""
from Bio import SeqIO
import sys

# Read the wanted record IDs (one per line, surrounding whitespace stripped)
# into a set so each membership test below is O(1) instead of a list scan.
with open(sys.argv[2]) as wanted_handle:
    wanted = {line.strip() for line in wanted_handle}

# Stream the FASTA records and write only those whose id is in the wanted
# set to stdout; the context manager closes the input when done.
with open(sys.argv[1]) as fasta_handle:
    seqiter = SeqIO.parse(fasta_handle, 'fasta')
    SeqIO.write((seq for seq in seqiter if seq.id in wanted), sys.stdout, "fasta")

## countchars.sh
awk '
	BEGIN{print"in.var\tin.ancest\tout.var\tout.ancest\tninmissing\tnoutmissing"}
	NR <= 1 { next }
	{ for(i=1;i<=n=split($4,a,"");i++)  if((a[i]=="1"))   b++;
	   for(i=1;i<=n=split($4,a,"");i++)  if((a[i]=="0"))  c++;
	   for(i=1;i<=n=split($4,a,"");i++)  if((a[i]=="."))  x++;
	   for(i=1;i<=n=split($3,a,"");i++)  if((a[i]=="1"))  d++;
	   for(i=1;i<=n=split($3,a,"");i++)  if((a[i]=="0"))  e++;
	   for(i=1;i<=n=split($3,a,"");i++)  if((a[i]=="."))  y++;
	   printf "%s\n",$2"\t"FS"\t"b"\t"c"\t"d"\t"e"\t"x"\t"y;

## resequenceandsnp.sh
#WORKFLOW FOR GENOME RESEQUENCING:

#CONCATENATE CASAVA OUTPUT
# Annoying first step: Casava 1.8 splits files into groups of 4,000,000 reads, requiring you to stitch them back together.
# Each command concatenates every file matching the read-1 (R1) or read-2 (R2) glob into a single per-pool file.
# NOTE: the globs (*P1_GA1_*R1* / *P1_GA1_*R2*) also match the output file names, so run this
# in a directory that does not already contain a previous Limenitis_Pool_P1_GA1_R?.fastq.gz.

cat *P1_GA1_*R1* > Limenitis_Pool_P1_GA1_R1.fastq.gz
cat *P1_GA1_*R2* > Limenitis_Pool_P1_GA1_R2.fastq.gz

# Repeat for each group of files for each individual pool/sample.

## illuminator.py
import os
import sys
import glob
import shutil
import argparse
import subprocess
import multiprocessing
import itertools

class FullPaths(argparse.Action):

## calcdistances.py
#!/usr/bin/env python
import sys

# Path of the input file, taken from the first command-line argument.
inputfile = sys.argv[1]

with open(inputfile) as fd:
    # Skip the first line (presumably a header -- confirm against the input
    # format); the None default avoids StopIteration on an empty file.
    next(fd, None)
    for line in fd:
        # Split each remaining line on runs of whitespace.
        columns = line.split()
        #print columns

## cortex.sh
perl /projectnb/mullenl/programs/CORTEX_release_v1.0.5.15/scripts/calling/run_calls.pl \
--first_kmer 31 \
--last_kmer 61 \
--kmer_step 30 \
--fastaq_index index_file \
--auto_cleaning yes \
--bc yes \
--pd no \
--outdir ./cortexresults \
--outvcf cortextrial \

## findempty.sh
# find all empty directories under /path/to/dest
find  /path/to/dest -type d -empty
# find all empty directories under /tmp
# (-type d selects directories; use -type f instead to list empty files)
find  /tmp -type d -empty

## git-import-repository.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                jasongallant
                / git-import-repository.md
            
            
              Created
              February 28, 2019 16:59
                — forked from martinbuberl/git-import-repository.md
            
              
                Import existing Git repository into another
              
          
    Import existing Git repository into another

Folder structure before (2 separate repositories):
XXX
 |- .git
 |- (project files)
YYY
 |- .git


## README.md

      
              2 files
            
          
              0 forks
            
          
              1 comment
            
          
              0 stars
            
          
                jasongallant
                / README.md
            
            
              Created
              March 26, 2021 01:24
                — forked from iracooke/README.md
            
              
                NCBI TSA Submission Guide
              
          
    Steps to submit to TSA

If you have a transcriptome that has been assembled from shotgun reads, the TSA (Transcriptome Shotgun Assembly) database is a good place to put it so that it can be widely accessed.
This guide assumes that you simply want to submit the assembled sequences from your transcriptome without annotations. NCBI sets a high bar for inclusion of annotations so for most non-model organisms they are probably not going to meet the criteria.
To create a TSA submission, take a look at the NCBI guidelines. This gist is based on those guidelines.
Register BioProject
	"""
	%prog some.fasta wanted-list.txt
	"""
	from Bio import SeqIO
	import sys

	wanted = [line.strip() for line in open(sys.argv[2])]
	seqiter = SeqIO.parse(open(sys.argv[1]), 'fasta')
	SeqIO.write((seq for seq in seqiter if seq.id in wanted), sys.stdout, "fasta")
	awk '
	BEGIN{print"in.var\tin.ancest\tout.var\tout.ancest\tninmissing\tnoutmissing"}
	NR <= 1 { next }
	{ for(i=1;i<=n=split($4,a,"");i++) if((a[i]=="1")) b++;
	for(i=1;i<=n=split($4,a,"");i++) if((a[i]=="0")) c++;
	for(i=1;i<=n=split($4,a,"");i++) if((a[i]==".")) x++;
	for(i=1;i<=n=split($3,a,"");i++) if((a[i]=="1")) d++;
	for(i=1;i<=n=split($3,a,"");i++) if((a[i]=="0")) e++;
	for(i=1;i<=n=split($3,a,"");i++) if((a[i]==".")) y++;
	printf "%s\n",$2"\t"FS"\t"b"\t"c"\t"d"\t"e"\t"x"\t"y;
	#WORKFLOW FOR GENOME RESEQUENCING:

	#CONCATENATE CASAVA OUTPUT
	# Annoying first step, Casava 1.8 splits files into groups of 4,000,000 reads, requiring you to stitch it back together...

	cat *P1_GA1_*R1* > Limenitis_Pool_P1_GA1_R1.fastq.gz
	cat *P1_GA1_*R2* > Limenitis_Pool_P1_GA1_R2.fastq.gz

	#repeat for each group of files per each individual pool/sample
	import os
	import sys
	import glob
	import shutil
	import argparse
	import subprocess
	import multiprocessing
	import itertools

	class FullPaths(argparse.Action):
	#!/usr/bin/env python
	import sys

	inputfile = sys.argv[1]

	with open(inputfile) as fd:
	next(fd)
	for line in fd:
	columns=line.split()
	#print columns
	perl /projectnb/mullenl/programs/CORTEX_release_v1.0.5.15/scripts/calling/run_calls.pl \
	--first_kmer 31 \
	--last_kmer 61 \
	--kmer_step 30 \
	--fastaq_index index_file \
	--auto_cleaning yes \
	--bc yes \
	--pd no \
	--outdir ./cortexresults \
	--outvcf cortextrial \
	find /path/to/dest -type d -empty
	# find all empty directories under /tmp
	find /tmp -type d -empty