Created
January 3, 2010 16:35
-
-
Save n8agrin/268030 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
### Script for accessing SGD database via www. Produces 2 output files: ### | |
### 1) sgd.txt contains the gene name for the sequence according to SGD ### | |
### 2) blast.txt contains the webpage (html) turned into text ### | |
### Open the file the sequence is in (tab-delimited) ### | |
open SEQ,"<sequence.txt" || die; | |
### Put the file into an array ### | |
@ygenes = <SEQ>; | |
### Array contains lines of the file. Split each line into variables and submit to www ### | |
for ($i=0;$i<$#ygenes+1;$i++) { | |
@input = split /\s/,$ygenes[$i]; | |
print "$i Getting gene name for $input[0]\n"; | |
open OUT, ">blast.txt" || die; | |
### Access the SGD database via the www ### | |
use LWP::UserAgent; | |
my $url = "http://genome-www2.stanford.edu/cgi-bin/SGD/nph-blast2sgd/10447?seqname=&filename=&sequence=$input[2]&program=blastn&database=YeastORF-N&.submit=Run+BLAST&filtop=default&output=gapped&matrix=BLOSUM62&sthr=default&wordlength=defaultðr=default&showal=100&sortop=pvalue&email=&sendhtml=URL"; | |
$browser = LWP::UserAgent->new(); | |
$browser->timeout(10); | |
my $request = HTTP::Request->new(GET => $url); | |
my $response = $browser->request($request); | |
until ($response->is_success()){ | |
$response = $browser->request($request); | |
if ($response->is_error()) {printf "%s\n", $response->status_line;} | |
} | |
$content = $response->content(); | |
### Create parser object ### | |
my $parser = HTML::Parser->new(api_version=>3, | |
text_h=>[\&textElem, 'text'] | |
); | |
### Parse object. ### | |
$parser->parse($content); | |
sub textElem | |
{ | |
my $text = shift; | |
print OUT "$text"; | |
} | |
### Use reg. expressions to pick out what you want from each webpage ### | |
open FORMAT, ">>sgd.txt" || die; | |
open IN ,"<blast.txt" || die; | |
while (<IN>){ | |
s/\s//g; | |
(1../ScoreP\(N\)N/i) and next; | |
/^\s*$/ and next; | |
@line = split //, $_; | |
for ($j=0;$j<7;$j++){ | |
print FORMAT "$line[$j]"; | |
} | |
print FORMAT "\t$input[0]\t$input[1]\t$input[2]\n"; | |
last; | |
} | |
close IN; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment