Lisa K. Johnson johnsolk

## MMETSP_diginorm_streaming_v_not.csv

          
            assembly
            n_seqs
            smallest
            largest
            n_bases
            p_contigs_with_CRBB
            p_refs_with_CRBB
            reference_coverage

            
              SRR1300409.Trinity.streaming.fixed.fasta_no_orphans
              30605
              224
              4134
              19238075
              0.97543
              0.92568
              0.85853

            
              SRR1300409.streaming.fixed.fasta
              30457
              224
              4134
              19243907
              0.98276
              0.93851
              0.89009

            
              SRR1300409.Trinity.fixed.fasta
              30771
              224
              4210
              19279950
              0.98096
              0.94858
              0.89449

            
              SRR1300402.streaming.fixed.fasta
              35128
              224
              13921
              22716553
              0.9928
              0.98108
              0.94234

            
              SRR1300402.not.fixed.fasta
              35101
              224
              13528
              22584802
              0.99282
              0.98024
              0.93656

            
              SRR1300410.Trinity.streaming.fixed.fasta
              9316
              224
              4094
              5055897
              0.99141
              0.96962
              0.93397

            
              SRR1300410.Trinity.fixed.fasta
              9414
              224
              4096
              5077982
              0.98959
              0.97993
              0.94012

## formatting_annotations.sh
sed 's_|_-_g' uniprot_sprot.blastx.outfmt6.sig > uniprot_sprot.blastx.outfmt6.fixed.sig
cut -f 1,2,17 uniprot_sprot.blastx.outfmt6.fixed.sig > uniprot_sprot.blastx.outfmt6.fixed.genes.sig
touch uniprot_P_ast_annotations_mansour_hits100DE.tab
for i in $(cut -f 2 P_ast_RNAseq_25pptv35ppt_top100DE.tab); do grep ${i/_/-} uniprot_sprot.blastx.outfmt6.fixed.genes.sig >> uniprot_P_ast_annotations_mansour_hits100DE.tab; done
sed 's/\-/\_/1' uniprot_P_ast_annotations_mansour_hits100DE.tab > uniprot_P_ast_annotations_mansour_hits100DE.fixed.tab

## example_gff3_dammit_output
##gff-version 3.2.1
Transcript_0	transdecoder	CDS	1664	2062	.	+	.	ID=cds.Transcript_0|m.1;Parent=Transcript_0|m.1
Transcript_0	transdecoder	exon	1	2064	.	+	.	ID=Transcript_0|m.1.exon1;Parent=Transcript_0|m.1
Transcript_0	transdecoder	five_prime_UTR	1	1663	.	+	.	ID=Transcript_0|m.1.utr5p1;Parent=Transcript_0|m.1
Transcript_0	transdecoder	gene	1	2064	.	+	.	ID=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
Transcript_0	transdecoder	mRNA	1	2064	.	+	.	ID=Transcript_0|m.1;Parent=Transcript_0|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
Transcript_0	transdecoder	three_prime_UTR	2063	2064	.	+	.	ID=Transcript_0|m.1.utr3p1;Parent=Transcript_0|m.1
Transcript_100002	transdecoder	CDS	1	1047	.	+	.	ID=cds.Transcript_100002|m.114344;Parent=Transcript_100002|m.114344
Transcript_100002	transdecoder	CDS	1411	1722	.	+	.	ID=cds.Transcript_100002|m.114346;Parent=Transcript_100002|m.114346
Transcript_100002

## fastq_to_fasta.py
# checks for redundancies in record headers
# converts fastq to fasta format
import screed

f = open('porecamp_killifish.fasta','wb')

for n,r in enumerate(screed.open('porecamp_killifish.fastq')):
  if r.name in s:
    continue
  else:

## ncbi_urlprediction.py
"""
From Luiz Irber.
Takes an SRA accession and determines the location of the .sra data file for automated or downloading.
Follows this format: ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{SRR|ERR|DRR}/<first 6 characters of accession>/<accession>/<accession>.sra
Format according to NCBI utility handbook: https://www.ncbi.nlm.nih.gov/books/NBK158899/
"""

def sra_url(accession):
    """Returns predicted URL given SRA accession as input."""
    accession = accession.upper()

## oysterriver_readnums
[ljcohen@dev-intel14 trim]$ for i in $(find /mnt/scratch/ljcohen/oysterriver/ -name "*left.fq"); do echo $i; done
/mnt/scratch/ljcohen/oysterriver/ERR1674585/trinity/ERR1674585.left.fq
/mnt/scratch/ljcohen/oysterriver/ERR489297/trinity/ERR489297.left.fq
/mnt/scratch/ljcohen/oysterriver/DRR036858/trinity/DRR036858.left.fq
/mnt/scratch/ljcohen/oysterriver/DRR069093/trinity/DRR069093.left.fq
/mnt/scratch/ljcohen/oysterriver/DRR053698/trinity/DRR053698.left.fq
/mnt/scratch/ljcohen/oysterriver/DRR031870/trinity/DRR031870.left.fq
/mnt/scratch/ljcohen/oysterriver/SRR2016923/trinity/SRR2016923.left.fq
/mnt/scratch/ljcohen/oysterriver/ERR058009/trinity/ERR058009.left.fq
/mnt/scratch/ljcohen/oysterriver/SRR2086412/trinity/SRR2086412.left.fq

## replace.sh
#replace IUPAC ambiguous characters in all fasta files in a directory
for i in $(ls *.fa); do sed -i -e 's/[YRWSKMDVHBX]/N/g' $i; done

## msu_hpcc_commands.sh
# delete block of jobs in queue all starting with specific digits in Job ID
qstat -u ljcohen | grep "472526" | cut -d "." -f1 | xargs qdel

## fasta_to_table.py
# From within a py3 virtualenv
# or
# sudo pip install screed
# source ~/bin/py3/bin/activate
import screed
with open("table.txt",'w') as fq:
  for r in screed.open('trinity.nema.full.fasta'):
    fq.write(r.name+"\n")

## mmetsp_loops.sh
for file in $(find . -name "*.fasta.dammit.gff3"); do echo $file; cut -f1 $file | uniq | wc -l; done > ~/MMETSP/assembly_evaluation_data/total_contigs


for file in $(find `pwd` -name "*.fasta.renamed.fasta.dammit.gff3");
do
  base=$(basename $file .renamed.fasta.dammit.gff3)
  new=$base.dammit.gff3
  echo cp $file /mnt/scratch/ljcohen/mmetsp_dammit/cp/gff3/$new
done
assembly	n_seqs	smallest	largest	n_bases	p_contigs_with_CRBB	p_refs_with_CRBB	reference_coverage
SRR1300409.Trinity.streaming.fixed.fasta_no_orphans	30605	224	4134	19238075	0.97543	0.92568	0.85853
SRR1300409.streaming.fixed.fasta	30457	224	4134	19243907	0.98276	0.93851	0.89009
SRR1300409.Trinity.fixed.fasta	30771	224	4210	19279950	0.98096	0.94858	0.89449
SRR1300402.streaming.fixed.fasta	35128	224	13921	22716553	0.9928	0.98108	0.94234
SRR1300402.not.fixed.fasta	35101	224	13528	22584802	0.99282	0.98024	0.93656
SRR1300410.Trinity.streaming.fixed.fasta	9316	224	4094	5055897	0.99141	0.96962	0.93397
SRR1300410.Trinity.fixed.fasta	9414	224	4096	5077982	0.98959	0.97993	0.94012
	sed 's_\|_-_g' uniprot_sprot.blastx.outfmt6.sig > uniprot_sprot.blastx.outfmt6.fixed.sig
	cut -f 1,2,17 uniprot_sprot.blastx.outfmt6.fixed.sig > uniprot_sprot.blastx.outfmt6.fixed.genes.sig
	touch uniprot_P_ast_annotations_mansour_hits100DE.tab
	for i in $(cut -f 2 P_ast_RNAseq_25pptv35ppt_top100DE.tab); do grep ${i/_/-} uniprot_sprot.blastx.outfmt6.fixed.genes.sig >> uniprot_P_ast_annotations_mansour_hits100DE.tab; done
	sed 's/\-/\_/1' uniprot_P_ast_annotations_mansour_hits100DE.tab > uniprot_P_ast_annotations_mansour_hits100DE.fixed.tab
	##gff-version 3.2.1
	Transcript_0 transdecoder CDS 1664 2062 . + . ID=cds.Transcript_0\|m.1;Parent=Transcript_0\|m.1
	Transcript_0 transdecoder exon 1 2064 . + . ID=Transcript_0\|m.1.exon1;Parent=Transcript_0\|m.1
	Transcript_0 transdecoder five_prime_UTR 1 1663 . + . ID=Transcript_0\|m.1.utr5p1;Parent=Transcript_0\|m.1
	Transcript_0 transdecoder gene 1 2064 . + . ID=Transcript_0\|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
	Transcript_0 transdecoder mRNA 1 2064 . + . ID=Transcript_0\|m.1;Parent=Transcript_0\|g.1;Name=ORF%20Transcript_0%7Cg.1%20Transcript_0%7Cm.1%20type%3A3prime_partial%20len%3A134%20%28%2B%29
	Transcript_0 transdecoder three_prime_UTR 2063 2064 . + . ID=Transcript_0\|m.1.utr3p1;Parent=Transcript_0\|m.1
	Transcript_100002 transdecoder CDS 1 1047 . + . ID=cds.Transcript_100002\|m.114344;Parent=Transcript_100002\|m.114344
	Transcript_100002 transdecoder CDS 1411 1722 . + . ID=cds.Transcript_100002\|m.114346;Parent=Transcript_100002\|m.114346
	Transcript_100002
	# checks for redundancies in record headers
	# converts fastq to fasta format
	import screed

	f = open('porecamp_killifish.fasta','wb')

	for n,r in enumerate(screed.open('porecamp_killifish.fastq')):
	if r.name in s:
	continue
	else:
	"""
	From Luiz Irber.
	Takes an SRA accession and determines the location of the .sra data file for automated or downloading.
	Follows this format: ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{SRR\|ERR\|DRR}/<first 6 characters of accession>/<accession>/<accession>.sra
	Format according to NCBI utility handbook: https://www.ncbi.nlm.nih.gov/books/NBK158899/
	"""

	def sra_url(accession):
	"""Returns predicted URL given SRA accession as input."""
	accession = accession.upper()
	[ljcohen@dev-intel14 trim]$ for i in $(find /mnt/scratch/ljcohen/oysterriver/ -name "*left.fq"); do echo $i; done
	/mnt/scratch/ljcohen/oysterriver/ERR1674585/trinity/ERR1674585.left.fq
	/mnt/scratch/ljcohen/oysterriver/ERR489297/trinity/ERR489297.left.fq
	/mnt/scratch/ljcohen/oysterriver/DRR036858/trinity/DRR036858.left.fq
	/mnt/scratch/ljcohen/oysterriver/DRR069093/trinity/DRR069093.left.fq
	/mnt/scratch/ljcohen/oysterriver/DRR053698/trinity/DRR053698.left.fq
	/mnt/scratch/ljcohen/oysterriver/DRR031870/trinity/DRR031870.left.fq
	/mnt/scratch/ljcohen/oysterriver/SRR2016923/trinity/SRR2016923.left.fq
	/mnt/scratch/ljcohen/oysterriver/ERR058009/trinity/ERR058009.left.fq
	/mnt/scratch/ljcohen/oysterriver/SRR2086412/trinity/SRR2086412.left.fq
	#replace IUPAC ambiguous characters in all fasta files in a directory
	for i in $(ls *.fa); do sed -i -e 's/[YRWSKMDVHBX]/N/g' $i; done
	# delete block of jobs in queue all starting with specific digits in Job ID
	qstat -u ljcohen \| grep "472526" \| cut -d "." -f1 \| xargs qdel
	# From within a py3 virtualenv
	# or
	# sudo pip install screed
	# source ~/bin/py3/bin/activate
	import screed
	with open("table.txt",'w') as fq:
	for r in screed.open('trinity.nema.full.fasta'):
	fq.write(r.name+"\n")
	for file in $(find . -name "*.fasta.dammit.gff3"); do echo $file; cut -f1 $file \| uniq \| wc -l; done > ~/MMETSP/assembly_evaluation_data/total_contigs



	for file in $(find `pwd` -name "*.fasta.renamed.fasta.dammit.gff3");
	do
	base=$(basename $file .renamed.fasta.dammit.gff3)
	new=$base.dammit.gff3
	echo cp $file /mnt/scratch/ljcohen/mmetsp_dammit/cp/gff3/$new
	done