Skip to content

Instantly share code, notes, and snippets.

@n8agrin
Created January 3, 2010 16:35
Show Gist options
  • Save n8agrin/268030 to your computer and use it in GitHub Desktop.
Save n8agrin/268030 to your computer and use it in GitHub Desktop.
#!perl -w
### Script for accessing SGD database via www. Produces 2 output files: ###
### 1) sgd.txt contains the gene name for the sequence according to SGD ###
### 2) blast.txt contains the webpage (html) turned into text ###
### Open the file the sequence is in (tab-delimited) ###
open SEQ,"<sequence.txt" || die;
### Put the file into an array ###
@ygenes = <SEQ>;
### Array contains lines of the file. Split each line into variables and submit to www ###
for ($i=0;$i<$#ygenes+1;$i++) {
@input = split /\s/,$ygenes[$i];
print "$i Getting gene name for $input[0]\n";
open OUT, ">blast.txt" || die;
### Access the SGD database via the www ###
use LWP::UserAgent;
my $url = "http://genome-www2.stanford.edu/cgi-bin/SGD/nph-blast2sgd/10447?seqname=&filename=&sequence=$input[2]&program=blastn&database=YeastORF-N&.submit=Run+BLAST&filtop=default&output=gapped&matrix=BLOSUM62&sthr=default&wordlength=default&ethr=default&showal=100&sortop=pvalue&email=&sendhtml=URL";
$browser = LWP::UserAgent->new();
$browser->timeout(10);
my $request = HTTP::Request->new(GET => $url);
my $response = $browser->request($request);
until ($response->is_success()){
$response = $browser->request($request);
if ($response->is_error()) {printf "%s\n", $response->status_line;}
}
$content = $response->content();
### Create parser object ###
my $parser = HTML::Parser->new(api_version=>3,
text_h=>[\&textElem, 'text']
);
### Parse object. ###
$parser->parse($content);
sub textElem
{
my $text = shift;
print OUT "$text";
}
### Use reg. expressions to pick out what you want from each webpage ###
open FORMAT, ">>sgd.txt" || die;
open IN ,"<blast.txt" || die;
while (<IN>){
s/\s//g;
(1../ScoreP\(N\)N/i) and next;
/^\s*$/ and next;
@line = split //, $_;
for ($j=0;$j<7;$j++){
print FORMAT "$line[$j]";
}
print FORMAT "\t$input[0]\t$input[1]\t$input[2]\n";
last;
}
close IN;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment