Skip to content

Instantly share code, notes, and snippets.

@s4553711
Created June 16, 2013 16:39
Show Gist options
  • Save s4553711/adc214d303167eaca330 to your computer and use it in GitHub Desktop.
Save s4553711/adc214d303167eaca330 to your computer and use it in GitHub Desktop.
A simple program for submitting ncbi blast search in Java
package Bio;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import org.dom4j.Element;
/**
 * Small client for the NCBI BLAST {@code Blast.cgi} web endpoint.
 *
 * <p>Typical use: configure the query with the {@code set_*} methods, call {@link #run()} to
 * submit a {@code blastp} job, poll {@link #check_status()} until it returns 1, call
 * {@link #fetch_blast_result()} to download and parse the XML report, then read the parsed hits
 * from {@link #get_result()}. {@link #clear()} resets the instance for the next query.
 *
 * <p>All three network operations share one retry counter ({@link #get_retry()}); it is only
 * reset by {@link #clear()}.
 */
public class Blast {
    /** Last bracketed token of a definition line, e.g. {@code [Homo sapiens]}. */
    private static final Pattern TAX_PATTERN = Pattern.compile("\\[(.*?)\\]");
    /** {@code gi|<gi>|<db>|<accession>.<digit>| ... [<taxonomy>]} inside a hit definition. */
    private static final Pattern DEFLINE_ENTRY_PATTERN =
            Pattern.compile("gi\\|(.*?)\\|(.*?)\\|(.*?)\\.\\d\\|.*?\\[(.*?)\\]");
    /** {@code gi|<gi>|<db>|<accession>.<digit>} inside a hit id. */
    private static final Pattern HIT_ID_PATTERN = Pattern.compile("gi\\|(.*?)\\|(.*?)\\|(.*?)\\.\\d");
    /** Job status cell of the HTML status page. */
    private static final Pattern STATUS_PATTERN =
            Pattern.compile("<tr class=\"odd\"><td>Status</td><td>(.*?)</td></tr>");
    /** Suggested refresh interval on the HTML status page. */
    private static final Pattern WAIT_PATTERN = Pattern.compile(
            "<p class=\"WAITING\">This page will be automatically updated in <b>(.*?)</b> seconds</p>");
    /** "Time since submission" cell of the HTML status page. */
    private static final Pattern DURATION_PATTERN =
            Pattern.compile("Time since submission</td><td>(\\S+)</td>");
    /** Request id field of the HTML page returned after submission. */
    private static final Pattern RID_PATTERN = Pattern.compile(
            "<input name=\"RID\" size=\"50\" type=\"text\" value=\"(\\S+)\" id=\"rid\" />");
    /** Error message block of the HTML page returned after a rejected submission. */
    private static final Pattern SUBMIT_ERROR_PATTERN = Pattern.compile(
            "<ul id=\"msgR\" class=\"msg\"><li class=\"error\">"
            + "<p class=\"error\">(.*?)</p></li></ul>");

    /** An HSP is kept only if its midline has a run of at least this many identical residues. */
    private static final int MIN_IDENTICAL_RUN = 6;
    /** The result download gives up once the shared retry counter reaches this value. */
    private static final int FETCH_RETRY_LIMIT = 3;
    /** Submission and status polling give up once the shared retry counter reaches this value. */
    private static final int REQUEST_RETRY_LIMIT = 4;
    private static final long FETCH_RETRY_DELAY_MS = 5000;
    private static final long REQUEST_RETRY_DELAY_MS = 10000;

    private String blast_url;
    private String query_db;
    private String query_sp;
    private String query_from;
    private String query_to;
    private String query_tar;
    private String result;
    private String rid;
    private String query_status;
    private String query_wait;
    private String query_duration;
    private Hashtable<String,Hashtable<String,String>> stor;
    private int retry_i;
    private String exception_str;

    /*t1225039*/
    /** Creates a client with an empty query, pointing at the NCBI {@code Blast.cgi} URL. */
    public Blast() {
        blast_url = "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi";
        query_db = "";
        query_sp = "";
        query_from = "";
        query_to = "";
        query_tar = "";
        result = "";
        rid = "";
        query_status = "";
        query_wait = "";
        query_duration = "";
        stor = new Hashtable<String,Hashtable<String,String>>();
        retry_i = 0;
        exception_str = "";
    }

    /**
     * Sets the database name sent as {@code DATABASE}.
     *
     * @return 1 if the value was stored, 0 if it was {@code null} or empty and therefore ignored
     */
    public int set_db(String val) {
        if (is_empty(val)) {
            return 0;
        }
        query_db = val;
        return 1;
    }

    /**
     * Sets the species filter sent as {@code ENTREZ_QUERY}.
     *
     * @return 1 if the value was stored, 0 if it was {@code null} or empty and therefore ignored
     */
    public int set_sp(String val) {
        if (is_empty(val)) {
            return 0;
        }
        query_sp = val;
        return 1;
    }

    /**
     * Sets the start of the query range sent as {@code QUERY_FROM}.
     *
     * @return 1 if the value was stored, 0 if it was {@code null} or empty and therefore ignored
     */
    public int set_from(String val) {
        if (is_empty(val)) {
            return 0;
        }
        query_from = val;
        return 1;
    }

    /**
     * Sets the end of the query range sent as {@code QUERY_TO}.
     *
     * @return 1 if the value was stored, 0 if it was {@code null} or empty and therefore ignored
     */
    public int set_to(String val) {
        if (is_empty(val)) {
            return 0;
        }
        query_to = val;
        return 1;
    }

    /**
     * Sets the query target sent as {@code QUERY}.
     *
     * @return 1 if the value was stored, 0 if it was {@code null} or empty and therefore ignored
     */
    public int set_tar(String val) {
        if (is_empty(val)) {
            return 0;
        }
        query_tar = val;
        return 1;
    }

    /** Returns the refresh interval scraped by the last {@link #check_status()}, or "". */
    public String get_wait() {
        return query_wait;
    }

    /** Returns the "time since submission" value scraped by {@link #check_status()}, or "". */
    public String get_duration() {
        return query_duration;
    }

    /** Returns the raw response body of the last submission made by {@link #run()}. */
    public String result_query() {
        return result;
    }

    /** Overrides the request id, e.g. to fetch the result of an earlier submission. */
    public void set_rid(String t) {
        rid = t;
    }

    /**
     * Returns the live table of parsed hits, keyed by accession. Each value maps the column
     * names {@code oLen, gi, tax, full, align_len, align_idn, align_pos, align_gap, qfrom, qto,
     * hfrom, hto, com_seq} to text taken from the XML report.
     */
    public Hashtable<String,Hashtable<String,String>> get_result() {
        return stor;
    }

    /** Returns the last recorded error code or server error message, or "". */
    public String get_error_message() {
        return exception_str;
    }

    /** Returns the number of failed network attempts since construction or {@link #clear()}. */
    public int get_retry() {
        return retry_i;
    }

    /** Resets query parameters, request id, status fields, retry counter and parsed hits. */
    public void clear() {
        exception_str = "";
        retry_i = 0;
        rid = "";
        query_db = "";
        query_from = "";
        query_sp = "";
        query_status = "";
        query_tar = "";
        query_to = "";
        query_wait = "";
        query_duration = "";
        for (String s : stor.keySet()) {
            stor.get(s).clear();
        }
        stor.clear();
    }

    /**
     * Downloads the XML report for the current request id and adds every qualifying hit to the
     * table returned by {@link #get_result()}.
     *
     * <p>An HSP qualifies when its midline contains at least {@value #MIN_IDENTICAL_RUN}
     * consecutive identical residues. For a qualifying HSP, every accession found in the hit
     * definition is stored (first HSP wins), followed by the hit's own accession.
     *
     * @return always 1 when parsing completed
     * @throws java.lang.Exception "Result Fetch Error" when the connection could not be opened
     *         and the retry budget is used up; "Unknow Error" for any other failure (including
     *         an unparsable or unexpected document). The original failure is attached as cause.
     */
    public int fetch_blast_result() throws java.lang.Exception {
        try {
            System.out.println("Log> Parsing RID: "+rid);
            URI uri = new URI("http","www.ncbi.nlm.nih.gov","/blast/Blast.cgi",
                    "CMD=Get&RID="+rid+"&ALIGNMENTS=500&FORMAT_TYPE=XML",null);
            URL url = uri.toURL();

            Document document;
            java.io.InputStream in = url.openStream();
            try {
                document = new SAXReader().read(in);
            } finally {
                close_quietly(in);
            }

            // Length of the original query protein, copied into every stored entry.
            String oLen = document.selectSingleNode("/BlastOutput/BlastOutput_query-len").getText();
            Iterator<?> hits = document.selectNodes(
                    "/BlastOutput/BlastOutput_iterations/Iteration/Iteration_hits/Hit").iterator();
            while (hits.hasNext()) {
                Element hit = (Element) hits.next();
                String tarNP = hit.selectSingleNode("./Hit_accession").getText();
                String tarID = hit.selectSingleNode("./Hit_id").getText();
                Iterator<?> hsps = hit.selectNodes("./Hit_hsps/Hsp").iterator();
                while (hsps.hasNext()) {
                    Element hsp = (Element) hsps.next();
                    if (!has_identical_run(hsp.selectSingleNode("./Hsp_midline").getText())) {
                        continue;
                    }
                    String tar_tax = "";
                    // A merged definition line lists several entries separated by '>'.
                    String definition = hit.selectSingleNode("./Hit_def").getText();
                    for (String entry : definition.split(">")) {
                        // The hit's taxonomy comes from the first entry that carries one.
                        if (is_empty(tar_tax)) {
                            tar_tax = last_match(TAX_PATTERN, entry, tar_tax);
                        }
                        Matcher ids = DEFLINE_ENTRY_PATTERN.matcher(entry);
                        while (ids.find()) {
                            String accession = ids.group(3);
                            if (!stor.containsKey(accession)) {
                                stor.put(accession,
                                        build_entry(ids.group(1), ids.group(4), hit, hsp, oLen));
                            }
                        }
                    }
                    if (!stor.containsKey(tarNP)) {
                        fetch_hsp(tarNP, tar_tax, tarID, hit, hsp, oLen);
                    }
                }
            }
            System.out.println("Log> End parsing");
        } catch (IOException e) {
            retry_i++;
            if (retry_i < FETCH_RETRY_LIMIT) {
                Thread.sleep(FETCH_RETRY_DELAY_MS);
                return fetch_blast_result();
            }
            exception_str = "Blast_Connection_Error";
            throw new java.lang.Exception("Result Fetch Error", e);
        } catch (Exception e) {
            exception_str = "Blast_ResultFetch_Nnknow_Error";
            throw new java.lang.Exception("Unknow Error", e);
        }
        return 1;
    }

    /**
     * Extracts the gi number and accession from a hit id.
     *
     * @return a table with the keys {@code gi} and {@code accn}; both are "" when the id does
     *         not match
     */
    private Hashtable<String,String> def_parse(String str) {
        Hashtable<String,String> ret = new Hashtable<String,String>();
        ret.put("gi", "");
        ret.put("accn", "");
        Matcher matches = HIT_ID_PATTERN.matcher(str);
        while (matches.find()) {
            ret.put("gi", matches.group(1));
            ret.put("accn", matches.group(3));
        }
        return ret;
    }

    /** Stores one HSP under the hit's own accession; the gi number is parsed from the hit id. */
    private void fetch_hsp(String np, String tax, String gi, Element ele, Element ele_unit, String oLen) {
        stor.put(np, build_entry(def_parse(gi).get("gi"), tax, ele, ele_unit, oLen));
    }

    /** Builds the column table for one HSP of one hit. */
    private static Hashtable<String,String> build_entry(String gi, String tax, Element hit,
            Element hsp, String oLen) {
        Hashtable<String,String> entry = new Hashtable<String,String>();
        entry.put("oLen", oLen);
        entry.put("gi", gi);
        entry.put("tax", tax);
        entry.put("full", hit.selectSingleNode("./Hit_len").getText());
        entry.put("align_len", hsp.selectSingleNode("./Hsp_align-len").getText());
        entry.put("align_idn", hsp.selectSingleNode("./Hsp_identity").getText());
        entry.put("align_pos", hsp.selectSingleNode("./Hsp_positive").getText());
        entry.put("align_gap", hsp.selectSingleNode("./Hsp_gaps").getText());
        entry.put("qfrom", hsp.selectSingleNode("./Hsp_query-from").getText());
        entry.put("qto", hsp.selectSingleNode("./Hsp_query-to").getText());
        entry.put("hfrom", hsp.selectSingleNode("./Hsp_hit-from").getText());
        entry.put("hto", hsp.selectSingleNode("./Hsp_hit-to").getText());
        entry.put("com_seq", hsp.selectSingleNode("./Hsp_midline").getText());
        return entry;
    }

    /** True when the midline, split on whitespace and '+', has a piece of the minimum length. */
    private static boolean has_identical_run(String midline) {
        for (String piece : midline.split("\\s|\\+")) {
            if (piece.length() >= MIN_IDENTICAL_RUN) {
                return true;
            }
        }
        return false;
    }

    /**
     * Polls the HTML status page for the current request id and refreshes the values behind
     * {@link #get_wait()} and {@link #get_duration()}.
     *
     * @return 0 when no request id is set or the job status is "Searching"; 1 otherwise
     *         (including when no status could be found in the page)
     * @throws java.lang.Exception "CheckStatus Error" once the retry budget is used up; the
     *         original failure is attached as cause
     */
    public int check_status() throws java.lang.Exception {
        if (is_empty(rid)) {
            return 0;
        }
        System.out.println("Log> Check RID: "+rid);
        query_status = "";
        query_wait = "";
        try {
            URL url = new URL(blast_url+"?CMD=Get&RID="+rid+"&ALIGNMENTS=500&email=lick@genetex.com&tool=java");
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setDoOutput(true);
            String page;
            OutputStreamWriter wr = null;
            BufferedReader rd = null;
            try {
                wr = new OutputStreamWriter(connection.getOutputStream());
                wr.flush();
                rd = new BufferedReader(new InputStreamReader(connection.getInputStream()));
                page = read_all(rd);
            } finally {
                close_quietly(rd);
                close_quietly(wr);
            }
            query_status = last_match(STATUS_PATTERN, page, query_status);
            query_wait = last_match(WAIT_PATTERN, page, query_wait);
            query_duration = last_match(DURATION_PATTERN, page, query_duration);
            return query_status.equals("Searching") ? 0 : 1;
        } catch (Exception e) {
            System.out.println("Log> Status Check Retry .. "+retry_i);
            retry_i++;
            if (retry_i < REQUEST_RETRY_LIMIT) {
                Thread.sleep(REQUEST_RETRY_DELAY_MS);
                // Report what the retry actually found instead of always answering "searching".
                return check_status();
            }
            exception_str = "Blast_Connection_Error";
            throw new java.lang.Exception("CheckStatus Error", e);
        }
    }

    /**
     * Submits the configured query as a {@code blastp} job and stores the request id assigned
     * by the server.
     *
     * <p>A species filter of "" or "all" is sent as an empty {@code ENTREZ_QUERY}. Parameter
     * values are concatenated into the request body as given, without URL encoding.
     *
     * @return 1 when a request id was obtained; 0 when the target or database is unset, or when
     *         the response carried no request id (any server error message found in the page is
     *         then available from {@link #get_error_message()})
     * @throws java.lang.Exception "Submit Error" once the retry budget is used up; the original
     *         failure is attached as cause
     */
    public int run() throws java.lang.Exception {
        if (is_empty(query_sp) || "all".equals(query_sp)) {
            query_sp = "";
        }
        if (is_empty(query_tar) || is_empty(query_db)) {
            return 0;
        }
        result = "";
        rid = "";
        // NOTE(review): "MATRIX_NANE" looks like a typo for the matrix parameter, so PAM30 is
        // probably never applied. Left unchanged because correcting it would change search
        // results - confirm the parameter name against the BLAST URL API before fixing.
        String postData = "CMD=Put&PROGRAM=blastp&DATABASE="+query_db+"&QUERY="+query_tar+"&QUERY_FROM="+query_from+
                "&QUERY_TO="+query_to+"&MATRIX_NANE=PAM30&EXPECT=200000&GAPCOSTS=9%201&ENTREZ_QUERY="+query_sp+
                "&email=lick@genetex.com&tool=java";
        try {
            System.out.println("Log> Run Blast");
            URL url = new URL(blast_url);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setDoOutput(true);
            connection.setRequestMethod("POST");
            OutputStreamWriter wr = null;
            BufferedReader rd = null;
            try {
                wr = new OutputStreamWriter(connection.getOutputStream());
                wr.write(postData);
                wr.flush();
                rd = new BufferedReader(new InputStreamReader(connection.getInputStream()));
                result = read_all(rd);
            } finally {
                close_quietly(rd);
                close_quietly(wr);
            }
            rid = last_match(RID_PATTERN, result, rid);
            if (is_empty(rid)) {
                // The job was rejected: keep the server's explanation for the caller.
                exception_str = last_match(SUBMIT_ERROR_PATTERN, result, exception_str);
                return 0;
            }
            return 1;
        } catch (Exception e) {
            System.out.println("Log> Submit Retry .. "+retry_i);
            retry_i++;
            if (retry_i < REQUEST_RETRY_LIMIT) {
                Thread.sleep(REQUEST_RETRY_DELAY_MS);
                // Propagate the retry's outcome; a successful resubmission must not look failed.
                return run();
            }
            exception_str = "Blast_Connection_Error";
            throw new java.lang.Exception("Submit Error", e);
        }
    }

    /** True for {@code null} and for the empty string (compared by content, not identity). */
    private static boolean is_empty(String s) {
        return s == null || s.length() == 0;
    }

    /** Returns group 1 of the last match of {@code pattern} in {@code text}, else {@code fallback}. */
    private static String last_match(Pattern pattern, String text, String fallback) {
        String found = fallback;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            found = matcher.group(1);
        }
        return found;
    }

    /** Reads every line of {@code reader} and joins them without line terminators. */
    private static String read_all(BufferedReader reader) throws IOException {
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line);
        }
        return text.toString();
    }

    /** Closes {@code resource} if it is open; a failure to close is deliberately ignored. */
    private static void close_quietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done here; the data has already been read or sent.
        }
    }
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.util.Hashtable;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.*;
import Bio.Blast;
/**
 * Command-line batch driver for {@link Blast}.
 *
 * <p>Reads a tab-separated input file line by line, submits one BLAST job per line, waits for
 * it to finish and appends the hits to {@code Result.tab}; failures are appended to
 * {@code Blast.log}. Settings {@code is_np}, {@code sleep} and {@code format} are read from
 * {@code config.properties} in the current directory.
 */
public class Blast_Run {
    /** Columns written for every hit in tab mode, in output order. */
    private static final String[] col_schema = new String[]{"gi","tax","oLen","full","align_len",
            "align_idn","align_pos","align_gap","qfrom","qto","hfrom","hto","com_seq"};
    private static FileWriter fwriter;
    private static FileWriter logfwriter;
    private static boolean read_np;
    /** Pause handed directly to {@code Thread.sleep}, i.e. milliseconds. */
    private static int sleep;
    public static String db;
    public static String sp;
    public static String mode;
    public static String work_directory;

    private static void set_read_np(boolean tar) {
        read_np = tar;
    }

    private static void set_sleep(int tar) {
        sleep = tar;
    }

    /** Sets the output mode; a {@code null} or empty value keeps the current mode. */
    private static void set_format(String v) {
        if (v != null && v.length() != 0) {
            mode = v;
        }
    }

    /**
     * Applies the built-in defaults and then overrides them from {@code config.properties}.
     * The defaults are assigned first so that a missing or broken config file cannot leave
     * {@code db}, {@code sp}, {@code mode} or {@code work_directory} as {@code null}.
     */
    private static void initial() {
        mode = "file";
        db = "refseq_protein";
        sp = "";
        work_directory = "";
        FileInputStream conf = null;
        try {
            Properties prop = new Properties();
            conf = new FileInputStream("config.properties");
            prop.load(conf);
            set_read_np(Boolean.parseBoolean(prop.getProperty("is_np")));
            set_sleep(Integer.parseInt(prop.getProperty("sleep")));
            set_format(prop.getProperty("format"));
        } catch (Exception e) {
            System.out.println("Log> Error while loading config file");
        } finally {
            close_quietly(conf);
        }
    }

    /**
     * Writes the hits of one query to the result file.
     *
     * <p>In {@code json} mode the whole table is written as one JSON object. Otherwise one
     * tab-separated line is written per hit: the input columns, the accession and the schema
     * columns. With {@code is_np} enabled (and a database other than {@code swissprot}) only
     * accessions starting with "NP" are written; for {@code swissprot} only hits whose
     * alignment length equals both the query length and the identity count are written.
     *
     * @param stor  hits keyed by accession, as returned by {@code Blast.get_result()}
     * @param input columns of the input line; needs at least two elements in tab mode
     */
    public static void output(Hashtable<String,Hashtable<String,String>> stor, String[] input) {
        if ("json".equals(mode)) {
            try {
                JSONObject jsonObject = new JSONObject(stor);
                fwriter.write(jsonObject.toString());
                fwriter.flush();
            } catch (Exception e) {
                System.out.println("Log> File writing error");
            }
            return;
        }

        boolean swissprot = "swissprot".equals(db);
        for (String tar : stor.keySet()) {
            if (read_np && !swissprot && !tar.startsWith("NP")) {
                continue;
            }
            Hashtable<String,String> hit = stor.get(tar);
            if (swissprot) {
                String alignLen = hit.get("align_len");
                boolean fullLength = hit.get("oLen").equals(alignLen);
                if (!fullLength || !alignLen.equals(hit.get("align_idn"))) {
                    continue;
                }
            }

            StringBuilder sb = new StringBuilder();
            if (input.length == 4) {
                sb.append(input[0]+"\t"+input[1]+"\t"+input[2]+"\t"+input[3]+"\t"+tar+"\t");
            } else {
                sb.append(input[0]+"\t"+input[1]+"\t"+tar+"\t");
            }
            for (String column : col_schema) {
                String value = hit.get(column);
                // Keep the column count stable even if a hit lacks a value.
                sb.append(value == null ? "" : value);
                sb.append("\t");
            }
            try {
                fwriter.write(sb.toString()+"\n");
                fwriter.flush();
            } catch (Exception e) {
                System.out.println("Log> File writing error");
            }
        }
    }

    /**
     * Entry point.
     *
     * @param args {@code args[1]} is the input file (required). With exactly three or four
     *        arguments, {@code args[0]} is the prefix prepended to the output file names and
     *        {@code args[2]} the database; with four, {@code args[3]} is the species filter.
     *        Each input line is tab-separated: column 0 is a label, column 1 the query target,
     *        and - only when the line has exactly four columns - columns 2 and 3 the start and
     *        stop positions.
     */
    public static void main(String[] args) {
        System.out.println("Log> Program Satrt");
        BufferedReader reader = null;
        try {
            initial();
            if (args.length < 2) {
                System.out.println("Log> Usage: Blast_Run <work_directory> <input_file> [db [species]]");
                return;
            }
            if (args.length == 3 || args.length == 4) {
                work_directory = args[0];
                db = args[2];
            }
            if (args.length == 4) {
                sp = args[3];
            }

            Blast bt = new Blast();
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[1])));
            // Both files are opened in append mode so earlier runs are preserved.
            fwriter = new FileWriter(new File(work_directory+"Result.tab"), true);
            logfwriter = new FileWriter(new File(work_directory+"Blast.log"), true);

            String line;
            while ((line = reader.readLine()) != null) {
                String[] input = line.split("\t");
                if (input.length < 2) {
                    // A blank or truncated line must not abort the rest of the batch.
                    System.out.println("Log> Skip malformed input line: "+line);
                    continue;
                }
                String StartAA = "";
                String StopAA = "";
                if (input.length == 4) {
                    StartAA = input[2];
                    StopAA = input[3];
                }
                System.out.println("Log> Process .. PJ: "+input[0]);
                System.out.println("Log> Process .. NP: "+input[1]);
                System.out.println("Log> Process .. StartAA: "+StartAA);
                System.out.println("Log> Process .. StopAA: "+StopAA);
                System.out.println("Log> Process .. DB: "+db);
                System.out.println("Log> Process .. SP: "+sp);
                System.out.println("Log> Process .. Mode: "+mode);

                bt.set_db(db);
                bt.set_sp(sp);
                bt.set_tar(input[1]);
                bt.set_from(StartAA);
                bt.set_to(StopAA);
                try {
                    int job_status = bt.run();
                    if (job_status == 0) {
                        System.out.println("Log> Blast Submit Error: "+bt.get_error_message());
                        logfwriter.write("Error\tSequenceError\t"+input[1]+"\t"+bt.get_error_message()+"\n");
                        logfwriter.flush();
                        bt.clear();
                        continue;
                    }
                    System.out.println("Log> Job Submitted");
                    try {
                        while (bt.check_status() == 0) {
                            System.out.println("Log> Again in "+sleep+"s - "+bt.get_duration());
                            Thread.sleep(sleep);
                        }
                    } catch (Exception e) {
                        System.out.println("Log> Blast StatusCheck Error: "+bt.get_error_message());
                        logfwriter.write("Error\tConnectionError\t"+input[1]+"\n");
                        logfwriter.flush();
                    }
                    // The fetch is attempted even after a failed status check (best effort).
                    try {
                        bt.fetch_blast_result();
                    } catch (Exception e) {
                        System.out.println("Log> Blast Fetch Error: "+bt.get_error_message());
                        logfwriter.write("Error\tConnectionError\t"+input[1]+"\n");
                        logfwriter.flush();
                    }
                    output(bt.get_result(), input);
                } catch (Exception e) {
                    System.out.println("Log> Blast Submit Error: "+bt.get_error_message());
                    logfwriter.write("Error\tSubmitError\t"+input[1]+"\n");
                    logfwriter.flush();
                }
                bt.clear();
                Thread.sleep(sleep);
            }
        } catch (Exception e) {
            System.out.println("Log> System Exception "+e.getMessage());
        } finally {
            close_quietly(reader);
            close_quietly(fwriter);
            close_quietly(logfwriter);
        }
    }

    /** Closes {@code resource} if it was opened, logging instead of failing when close fails. */
    private static void close_quietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            System.out.println("Log> Error while closing a file: "+e.getMessage());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment