Created
June 16, 2013 16:39
-
-
Save s4553711/adc214d303167eaca330 to your computer and use it in GitHub Desktop.
A simple Java program for submitting NCBI BLAST searches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Bio; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.io.OutputStreamWriter; | |
import java.net.HttpURLConnection; | |
import java.net.URI; | |
import java.net.URL; | |
import java.util.Hashtable; | |
import java.util.Iterator; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.dom4j.Document; | |
import org.dom4j.io.SAXReader; | |
import org.dom4j.Element; | |
public class Blast { | |
private String blast_url; | |
private String query_db; | |
private String query_sp; | |
private String query_from; | |
private String query_to; | |
private String query_tar; | |
private String result; | |
private String rid; | |
private String query_status; | |
private String query_wait; | |
private String query_duration; | |
private Hashtable<String,Hashtable<String,String>> stor; | |
private int retry_i; | |
private String exception_str; | |
/*t1225039*/ | |
public Blast(){ | |
blast_url = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi"; | |
query_db = ""; | |
query_sp = ""; | |
query_from = ""; | |
query_to = ""; | |
query_tar = ""; | |
result = ""; | |
rid = ""; | |
query_status = ""; | |
query_wait = ""; | |
query_duration = ""; | |
stor = new Hashtable<String,Hashtable<String,String>>(); | |
retry_i = 0; | |
exception_str = ""; | |
} | |
public int set_db(String val){ | |
if (val != ""){ | |
query_db = val; | |
return 1; | |
} else{ | |
return 0; | |
} | |
} | |
public int set_sp(String val){ | |
if (val != ""){ | |
query_sp = val; | |
return 1; | |
} else{ | |
return 0; | |
} | |
} | |
public int set_from(String val){ | |
if (val != ""){ | |
query_from = val; | |
return 1; | |
} else{ | |
return 0; | |
} | |
} | |
public int set_to(String val){ | |
if (val != ""){ | |
query_to = val; | |
return 1; | |
} else{ | |
return 0; | |
} | |
} | |
public int set_tar(String val){ | |
if (val != ""){ | |
query_tar = val; | |
return 1; | |
} else{ | |
return 0; | |
} | |
} | |
public String get_wait(){ | |
return query_wait; | |
} | |
public String get_duration(){ | |
return query_duration; | |
} | |
public String result_query(){ | |
return result; | |
} | |
public void set_rid(String t){ | |
rid = t; | |
} | |
public Hashtable<String,Hashtable<String,String>>get_result(){ | |
return stor; | |
} | |
public String get_error_message(){ | |
return exception_str; | |
} | |
public int get_retry(){ | |
return retry_i; | |
} | |
public void clear() | |
{ | |
exception_str = ""; | |
retry_i = 0; | |
rid = ""; | |
query_db = ""; | |
query_from = ""; | |
query_sp = ""; | |
query_status = ""; | |
query_tar = ""; | |
query_to = ""; | |
query_wait = ""; | |
query_duration = ""; | |
for(String s:stor.keySet()){ | |
stor.get(s).clear(); | |
} | |
stor.clear(); | |
} | |
public int fetch_blast_result() throws java.lang.Exception{ | |
try{ | |
System.out.println("Log> Parsing RID: "+rid); | |
URI uri = new URI("http","www.ncbi.nlm.nih.gov","/blast/Blast.cgi", | |
"CMD=Get&RID="+rid+"&ALIGNMENTS=500&FORMAT_TYPE=XML",null); | |
URL url = uri.toURL(); | |
// Setup for XML parser | |
SAXReader readerx = new SAXReader(); | |
Document document = readerx.read(url.openStream()); | |
// Get the length of original protein | |
String oLen = document.selectSingleNode("/BlastOutput/BlastOutput_query-len").getText(); | |
// Get Hits | |
Iterator Hits = document.selectNodes("/BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit").iterator(); | |
// Hits Loop | |
while(Hits.hasNext()){ | |
Element ele = (Element) Hits.next(); | |
String tarNP = ele.selectSingleNode("./Hit_accession").getText(); | |
String tarID = ele.selectSingleNode("./Hit_id").getText(); | |
Iterator HspLists = ele.selectNodes("./Hit_hsps/Hsp").iterator(); | |
while(HspLists.hasNext()){ | |
Element ele_unit = (Element) HspLists.next(); | |
// Unity | |
String tar_tax = ""; | |
// Find if there is 6 consecutive residues | |
boolean query_match = false; | |
String middle_line = ele_unit.selectSingleNode("./Hsp_midline").getText(); | |
String[] comp = middle_line.split("\\s|\\+"); | |
for(String r :comp){ | |
//System.out.println("Residues> "+r); | |
if (r.length() >= 6){ | |
query_match = true; | |
} | |
} | |
// Match | |
if(query_match){ | |
String OtherNP = ele.selectSingleNode("./Hit_def").getText(); | |
for(String r: OtherNP.split(">")){ | |
//System.out.println("GO_DEF> "+r); | |
// Get Species | |
if (tar_tax == ""){ | |
Pattern pattern_find_tax = Pattern.compile("\\[(.*?)\\]"); | |
Matcher matcher_tax = pattern_find_tax.matcher(r); | |
while(matcher_tax.find()){ | |
tar_tax = matcher_tax.group(1); | |
} | |
//System.out.println("GO_TAX> "+tar_tax); | |
} | |
// Find AccessionID within this entry | |
Pattern pattern_name_group = Pattern.compile("gi\\|(.*?)\\|(.*?)\\|(.*?)\\.\\d\\|.*?\\[(.*?)\\]"); | |
Matcher matcher_name_groups = pattern_name_group.matcher(r); | |
while(matcher_name_groups.find()){ | |
if (!stor.containsKey(matcher_name_groups.group(3).toString())){ | |
// tmp storage for each matched entry | |
Hashtable<String,String> tmp_stor = new Hashtable<String,String>(); | |
tmp_stor.put("oLen",oLen); | |
tmp_stor.put("gi", matcher_name_groups.group(1).toString()); | |
tmp_stor.put("tax", matcher_name_groups.group(4).toString()); | |
tmp_stor.put("full", ele.selectSingleNode("./Hit_len").getText()); | |
tmp_stor.put("align_len", ele_unit.selectSingleNode("./Hsp_align-len").getText()); | |
tmp_stor.put("align_idn", ele_unit.selectSingleNode("./Hsp_identity").getText()); | |
tmp_stor.put("align_pos", ele_unit.selectSingleNode("./Hsp_positive").getText()); | |
tmp_stor.put("align_gap", ele_unit.selectSingleNode("./Hsp_gaps").getText()); | |
tmp_stor.put("qfrom", ele_unit.selectSingleNode("./Hsp_query-from").getText()); | |
tmp_stor.put("qto", ele_unit.selectSingleNode("./Hsp_query-to").getText()); | |
tmp_stor.put("hfrom", ele_unit.selectSingleNode("./Hsp_hit-from").getText()); | |
tmp_stor.put("hto", ele_unit.selectSingleNode("./Hsp_hit-to").getText()); | |
tmp_stor.put("com_seq",ele_unit.selectSingleNode("./Hsp_midline").getText()); | |
stor.put(matcher_name_groups.group(3).toString(),tmp_stor); | |
} | |
} | |
} | |
if (!stor.containsKey(tarNP)){ | |
fetch_hsp(tarNP,tar_tax,tarID,ele,ele_unit,oLen); | |
} | |
} | |
} | |
} | |
System.out.println("Log> End parsing"); | |
} catch (IOException e){ | |
retry_i++; | |
if (retry_i < 3){ | |
Thread.sleep(5000); | |
fetch_blast_result(); | |
} else { | |
exception_str = "Blast_Connection_Error"; | |
throw new java.lang.Exception("Result Fetch Error"); | |
} | |
} catch (Exception e){ | |
exception_str = "Blast_ResultFetch_Nnknow_Error"; | |
throw new java.lang.Exception("Unknow Error"); | |
} | |
return 1; | |
} | |
/*private boolean is_np(String tar){ | |
boolean ret = false; | |
Pattern pattern = Pattern.compile("^NP"); | |
Matcher matcher = pattern.matcher(tar); | |
while(matcher.find()) { | |
System.out.println("Match .. "+tar); | |
} | |
return ret; | |
}*/ | |
private Hashtable<String,String> def_parse(String str){ | |
Hashtable<String,String> ret = new Hashtable<String,String>(); | |
Pattern pattr = Pattern.compile("gi\\|(.*?)\\|(.*?)\\|(.*?)\\.\\d"); | |
Matcher matches = pattr.matcher(str); | |
while(matches.find()){ | |
ret.put("gi", matches.group(1).toString()); | |
ret.put("accn", matches.group(3).toString()); | |
} | |
if (!ret.containsKey("gi")){ | |
ret.put("gi", ""); | |
} | |
if (!ret.containsKey("accn")){ | |
ret.put("accn", ""); | |
} | |
return ret; | |
} | |
private void fetch_hsp(String np, String tax, String gi, Element ele, Element ele_unit, String oLen){ | |
// tmp storeage for each matched entry | |
Hashtable<String,String> tmp_stor = new Hashtable<String,String>(); | |
Hashtable<String,String> def_info = def_parse(gi); | |
tmp_stor.put("oLen",oLen); | |
tmp_stor.put("gi", def_info.get("gi").toString()); | |
tmp_stor.put("tax", tax); | |
tmp_stor.put("full", ele.selectSingleNode("./Hit_len").getText()); | |
tmp_stor.put("align_len", ele_unit.selectSingleNode("./Hsp_align-len").getText()); | |
tmp_stor.put("align_idn", ele_unit.selectSingleNode("./Hsp_identity").getText()); | |
tmp_stor.put("align_pos", ele_unit.selectSingleNode("./Hsp_positive").getText()); | |
tmp_stor.put("align_gap", ele_unit.selectSingleNode("./Hsp_gaps").getText()); | |
tmp_stor.put("qfrom", ele_unit.selectSingleNode("./Hsp_query-from").getText()); | |
tmp_stor.put("qto", ele_unit.selectSingleNode("./Hsp_query-to").getText()); | |
tmp_stor.put("hfrom", ele_unit.selectSingleNode("./Hsp_hit-from").getText()); | |
tmp_stor.put("hto", ele_unit.selectSingleNode("./Hsp_hit-to").getText()); | |
tmp_stor.put("com_seq",ele_unit.selectSingleNode("./Hsp_midline").getText()); | |
stor.put(np,tmp_stor); | |
} | |
public int check_status() throws java.lang.Exception { | |
if (rid != ""){ | |
System.out.println("Log> Check RID: "+rid); | |
query_status = ""; | |
query_wait = ""; | |
try { | |
URL url = new URL(blast_url+"?CMD=Get&RID="+rid+"&ALIGNMENTS=500&email=lick@genetex.com&tool=java"); | |
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); | |
connection.setDoOutput(true); | |
OutputStreamWriter wr = new OutputStreamWriter(connection.getOutputStream()); | |
wr.flush(); | |
// Get the response | |
String tmp_result = ""; | |
BufferedReader rd = new BufferedReader(new InputStreamReader(connection.getInputStream())); | |
String line; | |
while ((line = rd.readLine()) != null) { | |
tmp_result += line; | |
} | |
rd.close(); | |
wr.close(); | |
// Find Job Status | |
Pattern pattern = Pattern.compile("<tr class=\"odd\"><td>Status</td><td>(.*?)</td></tr>"); | |
Matcher matcher = pattern.matcher(tmp_result); | |
while (matcher.find()) { | |
query_status = matcher.group(1); | |
} | |
// Find Wait Time | |
Pattern pattern2 = Pattern.compile("<p class=\"WAITING\">This page will be automatically updated in <b>(.*?)</b> seconds</p>"); | |
Matcher matcher2 = pattern2.matcher(tmp_result); | |
while (matcher2.find()) { | |
query_wait = matcher2.group(1); | |
} | |
// Find Wait Time | |
Pattern pattern3 = Pattern.compile("Time since submission</td><td>(\\S+)</td>"); | |
Matcher matcher3 = pattern3.matcher(tmp_result); | |
while (matcher3.find()) { | |
query_duration = matcher3.group(1); | |
} | |
if (query_status.equals("Searching")){ | |
return 0; | |
} else { | |
return 1; | |
} | |
} catch(Exception e){ | |
System.out.println("Log> Status Check Retry .. "+retry_i); | |
retry_i++; | |
if (retry_i < 4){ | |
Thread.sleep(10000); | |
check_status(); | |
} else { | |
exception_str = "Blast_Connection_Error"; | |
throw new java.lang.Exception("CheckStatus Error"); | |
} | |
} | |
} else { | |
} | |
return 0; | |
} | |
public int run()throws java.lang.Exception { | |
// Argument Check | |
// if there is no giving target ID or query database name, return false. | |
if (query_sp == "" || query_sp == "all"){ | |
query_sp = ""; | |
} | |
if (query_tar == ""){ | |
return 0; | |
} | |
if (query_db == ""){ | |
return 0; | |
} | |
// This variables is used to store retrieve info | |
result = ""; | |
rid = ""; | |
String postData = "CMD=Put&PROGRAM=blastp&DATABASE="+query_db+"&QUERY="+query_tar+"&QUERY_FROM="+query_from+ | |
"&QUERY_TO="+query_to+"&MATRIX_NANE=PAM30&EXPECT=200000&GAPCOSTS=9%201&ENTREZ_QUERY="+query_sp+ | |
"&email=lick@genetex.com&tool=java"; | |
try{ | |
System.out.println("Log> Run Blast"); | |
URL url = new URL(blast_url); | |
HttpURLConnection connection = (HttpURLConnection) url.openConnection(); | |
connection.setDoOutput(true); | |
//connection.setDoInput(true); | |
//connection.setInstanceFollowRedirects(false); | |
connection.setRequestMethod("POST"); | |
//connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); | |
//connection.setRequestProperty("charset", "utf-8"); | |
//connection.setRequestProperty("Content-Length", "" + Integer.toString(postData.getBytes().length)); | |
//connection.setUseCaches (false); | |
OutputStreamWriter wr = new OutputStreamWriter(connection.getOutputStream()); | |
wr.write(postData); | |
wr.flush(); | |
// Get the response | |
BufferedReader rd = new BufferedReader(new InputStreamReader(connection.getInputStream())); | |
String line; | |
while ((line = rd.readLine()) != null) { | |
// Process line.. | |
result += line; | |
} | |
// Get RID | |
Pattern pattern = Pattern.compile("<input name=\"RID\" size=\"50\" type=\"text\" value=\"(\\S+)\" id=\"rid\" />"); | |
Matcher matcher = pattern.matcher(result); | |
while (matcher.find()) { | |
rid = matcher.group(1); | |
} | |
//rid = "H98DRD9201R"; | |
wr.close(); | |
rd.close(); | |
// If some problems arise from job execution, return the error message and job status as zero | |
if (rid == ""){ | |
Pattern pattern_tmp = Pattern.compile("<ul id=\"msgR\" class=\"msg\"><li class=\"error\">"+ | |
"<p class=\"error\">(.*?)</p></li></ul>"); | |
Matcher matcher_tmp = pattern_tmp.matcher(result); | |
while(matcher_tmp.find()){ | |
exception_str = matcher_tmp.group(1); | |
} | |
return 0; | |
} | |
}catch(Exception e){ | |
System.out.println("Log> Submit Retry .. "+retry_i); | |
retry_i++; | |
if (retry_i < 4){ | |
Thread.sleep(10000); | |
run(); | |
} else { | |
exception_str = "Blast_Connection_Error"; | |
throw new java.lang.Exception("Submit Error"); | |
} | |
return 0; | |
} | |
return 1; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileWriter; | |
import java.io.InputStreamReader; | |
import java.util.Hashtable; | |
import java.util.Properties; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.json.*; | |
import Bio.Blast; | |
public class Blast_Run { | |
private static FileWriter fwriter; | |
private static FileWriter logfwriter; | |
private static String[] col_schema; | |
private static boolean read_np; | |
private static int sleep; | |
public static String db; | |
public static String sp; | |
public static String mode; | |
public static String work_directory; | |
private static void set_read_np(boolean tar){ | |
read_np = tar; | |
} | |
private static void set_sleep(int tar){ | |
sleep = tar; | |
} | |
private static void set_format(String v){ | |
if (v != ""){ | |
mode = v; | |
} | |
} | |
private static void initial(){ | |
try{ | |
// Reading conf file | |
Properties prop = new Properties(); | |
prop.load(new FileInputStream("config.properties")); | |
mode = "file"; | |
set_read_np(Boolean.parseBoolean(prop.getProperty("is_np"))); | |
set_sleep(Integer.parseInt(prop.getProperty("sleep"))); | |
set_format(prop.getProperty("format")); | |
db = "refseq_protein"; | |
sp = ""; | |
work_directory = ""; | |
} catch (Exception e){ | |
System.out.println("Log> Error while loading config file"); | |
} | |
} | |
public static void output(Hashtable<String,Hashtable<String,String>> stor, String[] input){ | |
if (mode.equals("json")){ | |
try{ | |
JSONObject jsonObject = new JSONObject(stor); | |
fwriter.write(jsonObject.toString()); | |
fwriter.flush(); | |
} catch (Exception e){ | |
System.out.println("Log> File writing error"); | |
} | |
} else { | |
for(String tar: stor.keySet()){ | |
if (read_np == true && !db.equals("swissprot")){ | |
// Searching for NP Protein | |
boolean is_np = false; | |
Pattern pattern = Pattern.compile("^NP"); | |
Matcher matcher = pattern.matcher(tar); | |
while(matcher.find()) { | |
is_np = true; | |
} | |
if (is_np == false) continue; | |
} | |
Hashtable<String,String> tmp_hash = stor.get(tar); | |
StringBuilder sb = new StringBuilder(); | |
// Check if db is uniprot then select the target with the same length | |
if (db.equals("swissprot")){ | |
if (tmp_hash.get("oLen").equals(tmp_hash.get("align_len"))){ | |
if (tmp_hash.get("align_len").equals(tmp_hash.get("align_idn"))){ | |
} else { | |
continue; | |
} | |
} else { | |
continue; | |
} | |
} | |
// Start output | |
if (input.length == 4){ | |
sb.append(input[0]+"\t"+input[1]+"\t"+input[2]+"\t"+input[3]+"\t"+tar+"\t"); | |
} else { | |
sb.append(input[0]+"\t"+input[1]+"\t"+tar+"\t"); | |
} | |
for(String s:col_schema){ | |
sb.append(tmp_hash.get(s).toString()); | |
sb.append("\t"); | |
} | |
try{ | |
fwriter.write(sb.toString()+"\n"); | |
fwriter.flush(); | |
} catch (Exception e){ | |
System.out.println("Log> File writing error"); | |
} | |
} | |
} | |
} | |
public static void main(String[] args) { | |
System.out.println("Log> Program Satrt"); | |
try{ | |
col_schema = new String[]{"gi","tax","oLen","full","align_len","align_idn","align_pos","align_gap", | |
"qfrom","qto","hfrom","hto","com_seq"}; | |
// Initialization | |
initial(); | |
switch (args.length){ | |
case 3: | |
work_directory = args[0].toString(); | |
db = args[2].toString(); | |
break; | |
case 4: | |
work_directory = args[0].toString(); | |
db = args[2].toString(); | |
sp = args[3].toString(); | |
break; | |
default: | |
} | |
// Blast tool initialization | |
Blast bt = new Blast(); | |
// Reading input file | |
FileInputStream fileInputStream = new FileInputStream(args[1]); | |
BufferedReader reader = new BufferedReader(new InputStreamReader( fileInputStream )); | |
// Setting output file, append the result if this file has existed | |
File saveFile = new File(work_directory+"Result.tab"); | |
fwriter = new FileWriter(saveFile,true); | |
// Setting log file | |
File logFile = new File(work_directory+"Blast.log"); | |
logfwriter = new FileWriter(logFile,true); | |
String line = ""; | |
while((line = reader.readLine()) != null){ | |
String[] input = line.split("\t"); | |
String StartAA = ""; | |
String StopAA = ""; | |
if (input.length == 4){ | |
StartAA = input[2].toString(); | |
StopAA = input[3].toString(); | |
} | |
System.out.println("Log> Process .. PJ: "+input[0]); | |
System.out.println("Log> Process .. NP: "+input[1]); | |
System.out.println("Log> Process .. StartAA: "+StartAA); | |
System.out.println("Log> Process .. StopAA: "+StopAA); | |
System.out.println("Log> Process .. DB: "+db); | |
System.out.println("Log> Process .. SP: "+sp); | |
System.out.println("Log> Process .. Mode: "+mode); | |
/*try { | |
bt.set_rid("8W92E1BF016"); | |
bt.fetch_blast_result(); | |
} catch (Exception e){ | |
System.out.println("Log> Blast Error: "+bt.get_error_message()); | |
logfwriter.write("Error\tConnectionError\t"+e.getMessage()+"\n"); | |
logfwriter.flush(); | |
} | |
output(bt.get_result(),input);*/ | |
bt.set_db(db); | |
bt.set_sp(sp); | |
bt.set_tar(input[1].toString()); | |
bt.set_from(StartAA); | |
bt.set_to(StopAA); | |
try { | |
int job_status = bt.run(); | |
System.out.println("Log> Job Submitted"); | |
if (job_status == 0){ | |
System.out.println("Log> Blast Submit Error: "+bt.get_error_message()); | |
logfwriter.write("Error\tSequenceError\t"+input[1]+"\t"+bt.get_error_message()+"\n"); | |
logfwriter.flush(); | |
bt.clear(); | |
continue; | |
} | |
try{ | |
while(bt.check_status() == 0){ | |
System.out.println("Log> Again in "+sleep+"s - "+bt.get_duration()); | |
Thread.sleep(sleep); | |
} | |
} catch (Exception e){ | |
System.out.println("Log> Blast StatusCheck Error: "+bt.get_error_message()); | |
logfwriter.write("Error\tConnectionError\t"+input[1]+"\n"); | |
logfwriter.flush(); | |
} | |
// Get the query result | |
try { | |
bt.fetch_blast_result(); | |
} catch (Exception e){ | |
System.out.println("Log> Blast Fetch Error: "+bt.get_error_message()); | |
logfwriter.write("Error\tConnectionError\t"+input[1]+"\n"); | |
logfwriter.flush(); | |
} | |
// Output | |
output(bt.get_result(),input); | |
} catch (Exception e){ | |
System.out.println("Log> Blast Submit Error: "+bt.get_error_message()); | |
logfwriter.write("Error\tSubmitError\t"+input[1]+"\n"); | |
logfwriter.flush(); | |
} | |
bt.clear(); | |
Thread.sleep(sleep); | |
} | |
fwriter.close(); | |
logfwriter.close(); | |
} catch (Exception e){ | |
System.out.println("Log> System Exception "+e.getMessage()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment