Folder structure before (2 separate repositories):
XXX
|- .git
|- (project files)
YYY
|- .git
""" | |
%prog some.fasta wanted-list.txt | |
""" | |
import sys

from Bio import SeqIO

# Usage: <script> some.fasta wanted-list.txt
# Streams to stdout, in FASTA format, every record of some.fasta whose id
# appears in wanted-list.txt (one id per line).

# Load the wanted ids into a set: membership is tested once per record, and a
# set keeps each test O(1). The with-block closes the list file afterwards.
with open(sys.argv[2]) as wanted_handle:
    wanted = {line.strip() for line in wanted_handle}

# Records are parsed and written lazily, one at a time; the FASTA handle stays
# open until SeqIO.write has consumed the generator, then is closed.
with open(sys.argv[1]) as fasta_handle:
    records = SeqIO.parse(fasta_handle, "fasta")
    SeqIO.write((rec for rec in records if rec.id in wanted), sys.stdout, "fasta")
awk ' | |
BEGIN{print"in.var\tin.ancest\tout.var\tout.ancest\tninmissing\tnoutmissing"} | |
NR <= 1 { next } | |
{ for(i=1;i<=n=split($4,a,"");i++) if((a[i]=="1")) b++; | |
for(i=1;i<=n=split($4,a,"");i++) if((a[i]=="0")) c++; | |
for(i=1;i<=n=split($4,a,"");i++) if((a[i]==".")) x++; | |
for(i=1;i<=n=split($3,a,"");i++) if((a[i]=="1")) d++; | |
for(i=1;i<=n=split($3,a,"");i++) if((a[i]=="0")) e++; | |
for(i=1;i<=n=split($3,a,"");i++) if((a[i]==".")) y++; | |
printf "%s\n",$2"\t"FS"\t"b"\t"c"\t"d"\t"e"\t"x"\t"y; |
# WORKFLOW FOR GENOME RESEQUENCING:
# CONCATENATE CASAVA OUTPUT
# Casava 1.8 splits each sample's reads into files of 4,000,000 reads each,
# so the chunks have to be stitched back together before anything else.
# Concatenated gzip members form a valid gzip stream, so plain `cat` suffices.
# The globs are anchored on the chunk suffix (R1_NNN / R2_NNN + .fastq.gz) so
# that a re-run cannot pick up the merged output file as one of its own inputs.
cat *P1_GA1_*R1_[0-9][0-9][0-9].fastq.gz > Limenitis_Pool_P1_GA1_R1.fastq.gz
cat *P1_GA1_*R2_[0-9][0-9][0-9].fastq.gz > Limenitis_Pool_P1_GA1_R2.fastq.gz
# Repeat for each group of files for each individual pool/sample.
import os | |
import sys | |
import glob | |
import shutil | |
import argparse | |
import subprocess | |
import multiprocessing | |
import itertools | |
class FullPaths(argparse.Action): |
#!/usr/bin/env python
# Usage: <script> inputfile
# Reads inputfile line by line, skipping the first line, and splits each
# remaining line on whitespace.
import sys

inputfile = sys.argv[1]
with open(inputfile) as fd:
    # Skip the first line (presumably a header -- confirm against the input
    # format). The None default keeps an empty file from raising StopIteration.
    next(fd, None)
    for line in fd:
        columns = line.split()
        # print(columns)  # uncomment to inspect the parsed fields
perl /projectnb/mullenl/programs/CORTEX_release_v1.0.5.15/scripts/calling/run_calls.pl \ | |
--first_kmer 31 \ | |
--last_kmer 61 \ | |
--kmer_step 30 \ | |
--fastaq_index index_file \ | |
--auto_cleaning yes \ | |
--bc yes \ | |
--pd no \ | |
--outdir ./cortexresults \ | |
--outvcf cortextrial \ |
# find all empty directories under /path/to/dest
find /path/to/dest -type d -empty
# find all empty directories under /tmp
# (use -type f instead of -type d to list empty files)
find /tmp -type d -empty
If you have a transcriptome that has been assembled from shotgun reads, the TSA
(Transcriptome Shotgun Assembly) database is a good place to put it so that it can be widely accessed.
This guide assumes that you simply want to submit the assembled sequences from your transcriptome without annotations. NCBI sets a high bar for inclusion of annotations, so for most non-model organisms the annotations are probably not going to meet the criteria.
To create a TSA
submission, take a look at the NCBI guidelines. This gist is based on those guidelines.