Created
May 6, 2014 22:58
-
-
Save nimezhu/bdd565017ea591f5c2d0 to your computer and use it in GitHub Desktop.
fastq file test ( contributed by Yang Zhang )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
if [ $# -lt 1 ] | |
then | |
echo "USAGE: test_contamination.sh <fastq_file>" | |
exit 0 | |
fi | |
# modify the following parameters as needed | |
Sample_Size=5000000 | |
Fastq_to_fasta_PATH=/home/zocean/Software/fastx_toolkit-0.0.13/src/fastq_to_fasta/fastq_to_fasta | |
BLAST_PATH=/home/zocean/Software/ncbi-blast-2.2.26+/bin/blastn | |
# sample fastq file | |
./fastqsampler.py $1 $Sample_Size >${1%.fastq}_sample_$Sample_Size.fastq | |
# convert fastq to fasta | |
$Fastq_to_fasta_PATH -n -v -i ${1%.fastq}_sample_$Sample_Size.fastq -o ${1%.fastq}_sample_$Sample_Size.fasta | |
## download Salmo Salar genome sequence from ncbi or you can download na nr database | |
## wget http://www.ncbi.nlm.nih.gov/Traces/wgs/?download=AGKD01.fasta.gz | |
## reference assembly papge | |
## http://www.ncbi.nlm.nih.gov/genome/assembly/313068/ | |
## build Salmo salar genome blast detabase | |
## /home/zocean/Software/ncbi-blast-2.2.26+/bin/makeblastdb -in 2011-10-25_Salmo_salar_WGS_AGKD01.fasta -parse_seqids -hash_index -out Salmo_salar_blast_database_AGKD01 -taxid 8030 -title Salmo_salar_WGS -input_type fasta -dbtype nucl | |
# do blast | |
$BLAST_PATH -query ${1%.fastq}_sample_$Sample_Size.fasta -db Salmo_salar_blast_database_AGKD01 -out blast.result -max_target_seqs 5 -num_threads 4 -outfmt "7 qacc sacc length evalue score pident" -lcase_masking | |
sed '/# BLAST/d' blast_result.txt | sed '/# Data/d' |sed '/# Field/d' | sed 's/^#\s//' | awk '{if(/Query/) {Num++;printf("%d\t%s",Num,$0);} else if(/DBRHHJN1/){printf("%d\t.\t.\t%s\n",Num,$0);} else {printf("%d\t%s\n",Num,$0);}}' >blast_parse.txt | |
awk 'BEGIN{num=0;sign=0}{if(sign==1) {print $0;sign=0;} if($1!=num) {num=$1;sign=1};}' blast_parse.txt | cut -f 5 | sed '/^$/d' | sed 's/^/gb|/;s/$/|/'> top1_ID.txt | |
# retrieve species ID | |
./retrieve_gb_by_acc.pl top1_ID.txt >top1_ID_retrieve_species.txt | |
cut -f 2 top1_ID_retrieve_species.txt | sort -k 1 | uniq -c | sed 's/\s\{1,\}//;s/\s/\t/' | sort -k 1 -n -r >pie_chart.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import random | |
import sys | |
random.seed() | |
#name of the input file (fasta or fastq) | |
#assumes input file is standard fasta/fastq format | |
fileName = sys.argv[1] | |
#number of sequences to subsample | |
numSeq = int(sys.argv[2]) | |
increment = 0 | |
#if it's a fasta file | |
if (fileName.find(".fasta") != -1): | |
increment = 2 | |
#else if it's a fastq file | |
elif (fileName.find(".fastq") != -1): | |
increment = 4 | |
#quit if neither | |
else: | |
sys.stdout.write("not a fasta/fastq file\n") | |
sys.exit() | |
FILE = open(fileName, 'r') | |
totalReads = list() | |
index = 0 | |
for line in FILE: | |
if(index % increment == 0): | |
totalReads.append(index/increment) | |
index += 1 | |
FILE.close() | |
if(len(totalReads) < numSeq): | |
sys.stdout.write("You only have "+str(len(totalReads))+" reads!\n") | |
sys.exit() | |
ttl = len(totalReads) | |
random.shuffle(totalReads) | |
totalReads = totalReads[0: numSeq] | |
totalReads.sort() | |
FILE = open(fileName, 'r') | |
listIndex = 0 | |
for i in range(0, ttl): | |
curRead = "" | |
for j in range(0, increment): | |
curRead += FILE.readline() | |
if (i == totalReads[listIndex]): | |
sys.stdout.write(curRead) | |
listIndex += 1 | |
if(listIndex == numSeq): | |
break | |
FILE.close() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# programmer: zocean | |
# usage: | |
# input: | |
# output: | |
use LWP::Simple; | |
my $esearch = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" . "db=nucleotide&usehistory=y&rettype=gb&retmode=text&seq_start=1&seq_stop=100&id="; | |
open(IN,$ARGV[0]) or die(); | |
#my $id= "ref|NM_001146472.1|"; | |
while(<IN>){ | |
chomp(); | |
my $id=$_; | |
my @p = split(/\|/,$id); | |
my $q=$p[1]; | |
my $sign=0; | |
my $esearch_result = get($esearch . $q); | |
my @esearch_table=split("\n",$esearch_result); | |
foreach $item(@esearch_table){ | |
$sign=0; | |
if($item=~/ORGANISM/){ | |
$item=~s/\s+ORGANISM\s+//; | |
print "Id:$id\tOrganism:$item\n"; | |
$sign=1;} | |
} | |
if ($sign=0){print "Id:$id\tOrganism:NOT FOUND\n";} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment