nimezhu/contamination_test.sh

## contamination_test.sh
#!/bin/bash

if [ $# -lt 1 ]
    then
    echo "USAGE: test_contamination.sh <fastq_file>"
    exit 0
fi

# modify the following parameters as needed
Sample_Size=5000000
Fastq_to_fasta_PATH=/home/zocean/Software/fastx_toolkit-0.0.13/src/fastq_to_fasta/fastq_to_fasta
BLAST_PATH=/home/zocean/Software/ncbi-blast-2.2.26+/bin/blastn

# sample fastq file
./fastqsampler.py $1 $Sample_Size >${1%.fastq}_sample_$Sample_Size.fastq

# convert fastq to fasta
$Fastq_to_fasta_PATH -n -v -i ${1%.fastq}_sample_$Sample_Size.fastq  -o ${1%.fastq}_sample_$Sample_Size.fasta

## download Salmo Salar genome sequence from ncbi or you can download na nr database
## wget http://www.ncbi.nlm.nih.gov/Traces/wgs/?download=AGKD01.fasta.gz
## reference assembly papge
## http://www.ncbi.nlm.nih.gov/genome/assembly/313068/

## build Salmo salar genome blast detabase
## /home/zocean/Software/ncbi-blast-2.2.26+/bin/makeblastdb -in 2011-10-25_Salmo_salar_WGS_AGKD01.fasta -parse_seqids -hash_index -out Salmo_salar_blast_database_AGKD01 -taxid 8030 -title Salmo_salar_WGS -input_type fasta -dbtype nucl

# do blast
$BLAST_PATH -query ${1%.fastq}_sample_$Sample_Size.fasta -db Salmo_salar_blast_database_AGKD01 -out blast.result -max_target_seqs 5 -num_threads 4 -outfmt "7 qacc sacc length evalue score pident" -lcase_masking


sed '/# BLAST/d' blast_result.txt | sed '/# Data/d' |sed '/# Field/d' | sed 's/^#\s//' | awk '{if(/Query/) {Num++;printf("%d\t%s",Num,$0);} else if(/DBRHHJN1/){printf("%d\t.\t.\t%s\n",Num,$0);} else {printf("%d\t%s\n",Num,$0);}}' >blast_parse.txt

awk 'BEGIN{num=0;sign=0}{if(sign==1) {print $0;sign=0;} if($1!=num) {num=$1;sign=1};}' blast_parse.txt | cut -f 5 | sed '/^$/d' | sed 's/^/gb|/;s/$/|/'> top1_ID.txt

# retrieve species ID
./retrieve_gb_by_acc.pl top1_ID.txt >top1_ID_retrieve_species.txt
cut -f 2 top1_ID_retrieve_species.txt | sort -k 1 | uniq -c | sed 's/\s\{1,\}//;s/\s/\t/' | sort -k 1 -n -r >pie_chart.txt

## fastqsampler.py
#!/usr/bin/env python
import random
import sys

random.seed()
#name of the input file (fasta or fastq)
#assumes input file is standard fasta/fastq format
fileName = sys.argv[1]
#number of sequences to subsample
numSeq = int(sys.argv[2])
increment = 0

#if it's a fasta file
if (fileName.find(".fasta") != -1):
    increment = 2
#else if it's a fastq file
elif (fileName.find(".fastq") != -1):
    increment = 4
#quit if neither
else:
    sys.stdout.write("not a fasta/fastq file\n")
    sys.exit()

FILE = open(fileName, 'r')
totalReads = list()
index = 0
for line in FILE:
    if(index % increment == 0):
        totalReads.append(index/increment)
    index += 1
FILE.close()
if(len(totalReads) < numSeq):
    sys.stdout.write("You only have "+str(len(totalReads))+" reads!\n")
    sys.exit()

ttl = len(totalReads)
random.shuffle(totalReads)
totalReads = totalReads[0: numSeq]
totalReads.sort()

FILE = open(fileName, 'r')
listIndex = 0

for i in range(0, ttl):
    curRead = ""
    for j in range(0, increment):
        curRead += FILE.readline()
    if (i == totalReads[listIndex]):
        sys.stdout.write(curRead)
        listIndex += 1
        if(listIndex == numSeq):
            break
FILE.close()


## retrieve_gb_by_acc.pl
#!/usr/bin/perl
# programmer: zocean
# usage:
# input:
# output:

use LWP::Simple;

my $esearch = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" . "db=nucleotide&usehistory=y&rettype=gb&retmode=text&seq_start=1&seq_stop=100&id=";
open(IN,$ARGV[0]) or die();
#my $id= "ref|NM_001146472.1|";
while(<IN>){
chomp();
my $id=$_;
my @p = split(/\|/,$id);
my $q=$p[1];
my $sign=0;
my $esearch_result = get($esearch . $q);
my @esearch_table=split("\n",$esearch_result);
foreach $item(@esearch_table){
    $sign=0;
    if($item=~/ORGANISM/){
        $item=~s/\s+ORGANISM\s+//;
        print "Id:$id\tOrganism:$item\n";
        $sign=1;}
}
if ($sign=0){print "Id:$id\tOrganism:NOT FOUND\n";}
}
	#!/bin/bash

	if [ $# -lt 1 ]
	then
	echo "USAGE: test_contamination.sh <fastq_file>"
	exit 0
	fi

	# modify the following parameters as needed
	Sample_Size=5000000
	Fastq_to_fasta_PATH=/home/zocean/Software/fastx_toolkit-0.0.13/src/fastq_to_fasta/fastq_to_fasta
	BLAST_PATH=/home/zocean/Software/ncbi-blast-2.2.26+/bin/blastn

	# sample fastq file
	./fastqsampler.py $1 $Sample_Size >${1%.fastq}_sample_$Sample_Size.fastq

	# convert fastq to fasta
	$Fastq_to_fasta_PATH -n -v -i ${1%.fastq}_sample_$Sample_Size.fastq -o ${1%.fastq}_sample_$Sample_Size.fasta

	## download Salmo Salar genome sequence from ncbi or you can download na nr database
	## wget http://www.ncbi.nlm.nih.gov/Traces/wgs/?download=AGKD01.fasta.gz
	## reference assembly papge
	## http://www.ncbi.nlm.nih.gov/genome/assembly/313068/

	## build Salmo salar genome blast detabase
	## /home/zocean/Software/ncbi-blast-2.2.26+/bin/makeblastdb -in 2011-10-25_Salmo_salar_WGS_AGKD01.fasta -parse_seqids -hash_index -out Salmo_salar_blast_database_AGKD01 -taxid 8030 -title Salmo_salar_WGS -input_type fasta -dbtype nucl

	# do blast
	$BLAST_PATH -query ${1%.fastq}_sample_$Sample_Size.fasta -db Salmo_salar_blast_database_AGKD01 -out blast.result -max_target_seqs 5 -num_threads 4 -outfmt "7 qacc sacc length evalue score pident" -lcase_masking


	sed '/# BLAST/d' blast_result.txt \| sed '/# Data/d' \|sed '/# Field/d' \| sed 's/^#\s//' \| awk '{if(/Query/) {Num++;printf("%d\t%s",Num,$0);} else if(/DBRHHJN1/){printf("%d\t.\t.\t%s\n",Num,$0);} else {printf("%d\t%s\n",Num,$0);}}' >blast_parse.txt

	awk 'BEGIN{num=0;sign=0}{if(sign==1) {print $0;sign=0;} if($1!=num) {num=$1;sign=1};}' blast_parse.txt \| cut -f 5 \| sed '/^$/d' \| sed 's/^/gb\|/;s/$/\|/'> top1_ID.txt

	# retrieve species ID
	./retrieve_gb_by_acc.pl top1_ID.txt >top1_ID_retrieve_species.txt
	cut -f 2 top1_ID_retrieve_species.txt \| sort -k 1 \| uniq -c \| sed 's/\s\{1,\}//;s/\s/\t/' \| sort -k 1 -n -r >pie_chart.txt
	#!/usr/bin/env python
	import random
	import sys

	random.seed()
	#name of the input file (fasta or fastq)
	#assumes input file is standard fasta/fastq format
	fileName = sys.argv[1]
	#number of sequences to subsample
	numSeq = int(sys.argv[2])
	increment = 0

	#if it's a fasta file
	if (fileName.find(".fasta") != -1):
	increment = 2
	#else if it's a fastq file
	elif (fileName.find(".fastq") != -1):
	increment = 4
	#quit if neither
	else:
	sys.stdout.write("not a fasta/fastq file\n")
	sys.exit()

	FILE = open(fileName, 'r')
	totalReads = list()
	index = 0
	for line in FILE:
	if(index % increment == 0):
	totalReads.append(index/increment)
	index += 1
	FILE.close()
	if(len(totalReads) < numSeq):
	sys.stdout.write("You only have "+str(len(totalReads))+" reads!\n")
	sys.exit()

	ttl = len(totalReads)
	random.shuffle(totalReads)
	totalReads = totalReads[0: numSeq]
	totalReads.sort()

	FILE = open(fileName, 'r')
	listIndex = 0

	for i in range(0, ttl):
	curRead = ""
	for j in range(0, increment):
	curRead += FILE.readline()
	if (i == totalReads[listIndex]):
	sys.stdout.write(curRead)
	listIndex += 1
	if(listIndex == numSeq):
	break
	FILE.close()
	#!/usr/bin/perl
	# programmer: zocean
	# usage:
	# input:
	# output:

	use LWP::Simple;

	my $esearch = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" . "db=nucleotide&usehistory=y&rettype=gb&retmode=text&seq_start=1&seq_stop=100&id=";
	open(IN,$ARGV[0]) or die();
	#my $id= "ref\|NM_001146472.1\|";
	while(<IN>){
	chomp();
	my $id=$_;
	my @p = split(/\\|/,$id);
	my $q=$p[1];
	my $sign=0;
	my $esearch_result = get($esearch . $q);
	my @esearch_table=split("\n",$esearch_result);
	foreach $item(@esearch_table){
	$sign=0;
	if($item=~/ORGANISM/){
	$item=~s/\s+ORGANISM\s+//;
	print "Id:$id\tOrganism:$item\n";
	$sign=1;}
	}
	if ($sign=0){print "Id:$id\tOrganism:NOT FOUND\n";}
	}