Last active
August 29, 2015 14:01
-
-
Save maulikkamdar/06c85b8555a543643f71 to your computer and use it in GitHub Desktop.
A Java program written to query multiple data sources - Coexpress, Corum, Gene Ontology (Biological Processes, Cellular Compartments, Molecular Functions), Gene Co-occurrence (Publications, KEGG Pathways, OMIM Diseases), Protein Domain-domain interactions and Gene Ideogram-Ideogram Interactions, which are stored as RDF in a Virtuoso triple store…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.deri.ppi; | |
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.DataInputStream; | |
import java.io.FileInputStream; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.net.HttpURLConnection; | |
import java.net.URL; | |
import java.net.URLEncoder; | |
import java.util.HashMap; | |
/**
 * Retrieves information from multiple data sources stored in Virtuoso as RDF
 * and writes one tab-separated line of evidence scores per protein pair.
 *
 * <p>Inputs (read from the working directory):
 * <ul>
 *   <li>{@code hgncConvertor.tsv} - EntrezGene-HGNC ID conversion table (first line is a header)</li>
 *   <li>{@code constrictedGo_rel.tsv} - custom GO file with the parent nodes removed</li>
 *   <li>{@code interactorsIdeogramDomainRef.tsv} - the protein pairs to score</li>
 * </ul>
 * Output: {@code interactorsIdeogramDomainRef_scores.tsv}.
 *
 * <p>The SPARQL endpoint defaults to {@code http://srvgal78.deri.ie/sparql/} and can be
 * overridden with the system property {@code interactors.sparqlEndpoint}.
 *
 * <p>A query that fails (I/O error or non-200 response) is reported on stderr and
 * contributes a score of zero; it never aborts the batch.
 */
public class Interactors {

    /** Filled from {@code hgncConvertor.tsv}: column 1 maps to column 0; looked up with the Entrez IDs of a pair. */
    static final HashMap<String, String> geneList = new HashMap<String, String>();
    /** Filled from {@code constrictedGo_rel.tsv}: column 0 maps to column 2 (';'-separated Biological Process terms). */
    static final HashMap<String, String> goBPList = new HashMap<String, String>();
    /** Filled from {@code constrictedGo_rel.tsv}: column 0 maps to column 3 (';'-separated Cellular Compartment terms). */
    static final HashMap<String, String> goCCList = new HashMap<String, String>();
    /** Filled from {@code constrictedGo_rel.tsv}: column 0 maps to column 4 (';'-separated Molecular Function terms). */
    static final HashMap<String, String> goMFList = new HashMap<String, String>();

    private static final String DEFAULT_SPARQL_ENDPOINT = "http://srvgal78.deri.ie/sparql/";
    private static final String ENDPOINT_PROPERTY = "interactors.sparqlEndpoint";
    private static final String QUERY_PREFIX = "?default-graph-uri=&query=";
    private static final String QUERY_SUFFIX = "&format=text%2Ftab-separated-values&timeout=0&debug=on";

    // Pair-file layout: EntrezID_A | UniprotID_A | EnsemblID_A | ChrStart_A | ChrStop_A | ChrLength_A |
    // ChrNumber_A | ChrStrand_A | ... repeat for B ... | Ideogram_A | Ideogram_B | Domains_A | Domains_B
    private static final int COL_ENTREZ_A = 0;
    private static final int COL_UNIPROT_A = 1;
    private static final int COL_CHR_A = 6;
    private static final int COL_ENTREZ_B = 8;
    private static final int COL_UNIPROT_B = 9;
    private static final int COL_CHR_B = 14;
    private static final int COL_IDEOGRAM_A = 16;
    private static final int COL_IDEOGRAM_B = 17;
    private static final int COL_DOMAINS_A = 18;
    private static final int COL_DOMAINS_B = 19;
    private static final int MIN_COLUMNS = 20;

    /** Marker used in the domain columns when a protein has no domains. */
    private static final String NO_DOMAINS = "NA";

    /**
     * Loads the lookup tables, then scores every pair in
     * {@code interactorsIdeogramDomainRef.tsv} and writes the scores to
     * {@code interactorsIdeogramDomainRef_scores.tsv}. Rows with fewer than 20
     * columns are skipped. A running row counter is printed to stdout.
     *
     * @param args ignored
     * @throws IOException if an input file cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        loadGeneList("hgncConvertor.tsv");
        loadGoLists("constrictedGo_rel.tsv");

        BufferedReader pairs = new BufferedReader(
                new InputStreamReader(new FileInputStream("interactorsIdeogramDomainRef.tsv")));
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter("interactorsIdeogramDomainRef_scores.tsv"));
            try {
                String pathLine;
                int count = 0;
                while ((pathLine = pairs.readLine()) != null) {
                    String[] strParts = pathLine.split("\t");
                    if (strParts.length < MIN_COLUMNS) {
                        continue;
                    }
                    out.write(scorePair(strParts));
                    System.out.println(count++);
                }
            } finally {
                // Closing in finally flushes whatever was scored before a failure.
                out.close();
            }
        } finally {
            pairs.close();
        }
    }

    /**
     * Performs an HTTP GET and returns the response body with every line followed
     * by a tab character (so a tab-separated result becomes one flat tab-separated string).
     *
     * @param urlStr the complete request URL
     * @return the flattened body, or the empty string on a non-200 response or any
     *         I/O error (never {@code null}, because callers split the result directly)
     */
    public static String httpGet(String urlStr) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(urlStr).openConnection();
            int status = conn.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                System.err.println("HTTP " + status + " for " + urlStr);
                return "";
            }
            BufferedReader rd = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
            try {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = rd.readLine()) != null) {
                    sb.append(line).append('\t');
                }
                return sb.toString();
            } finally {
                rd.close();
            }
        } catch (IOException e) {
            // Best effort: a failed lookup is reported and treated as "no result".
            System.err.println("Request failed for " + urlStr + ": " + e);
            return "";
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Reads the HGNC conversion file, skipping its header line, into {@link #geneList}. */
    private static void loadGeneList(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        try {
            String line;
            boolean header = true;
            while ((line = reader.readLine()) != null) {
                if (header) {
                    header = false;
                    continue;
                }
                String[] parts = line.split("\t");
                if (parts.length > 1) {
                    geneList.put(parts[1], parts[0]);
                }
            }
        } finally {
            reader.close();
        }
    }

    /** Reads the GO file into {@link #goBPList}, {@link #goCCList} and {@link #goMFList}. */
    private static void loadGoLists(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\t");
                if (parts.length > 2) {
                    goBPList.put(parts[0], parts[2]);
                }
                if (parts.length > 3) {
                    goCCList.put(parts[0], parts[3]);
                }
                if (parts.length > 4) {
                    goMFList.put(parts[0], parts[4]);
                }
            }
        } finally {
            reader.close();
        }
    }

    /** Computes every score for one pair row and returns the finished output line (newline included). */
    private static String scorePair(String[] cols) throws IOException {
        String entrezA = cols[COL_ENTREZ_A];
        String entrezB = cols[COL_ENTREZ_B];
        String uniprotA = cols[COL_UNIPROT_A];
        String uniprotB = cols[COL_UNIPROT_B];

        double coExpressScore = coexpressionScore(entrezA, entrezB);
        int corumScore = sharedCorumComplexes(uniprotA, uniprotB);

        // GO processing includes only leaves (the parent nodes were removed from the GO file).
        int goCCScore = sharedGoTerms(goCCList, uniprotA, uniprotB);
        int goBPScore = sharedGoTerms(goBPList, uniprotA, uniprotB);
        int goMFScore = sharedGoTerms(goMFList, uniprotA, uniprotB);

        int cooccurPath = 0;
        int cooccurPub = 0;
        int cooccurDis = 0;
        if (geneList.containsKey(entrezA) && geneList.containsKey(entrezB)) {
            String idA = geneList.get(entrezA);
            String idB = geneList.get(entrezB);
            // Locale.ROOT keeps the connector IRIs stable regardless of the JVM's default locale.
            String connector = (idA + "_" + idB).toLowerCase(java.util.Locale.ROOT);
            String reverseConnector = (idB + "_" + idA).toLowerCase(java.util.Locale.ROOT);
            cooccurPath = cooccurrenceWeight("pathwayconnectors", "keggCon", connector, reverseConnector);
            cooccurPub = cooccurrenceWeight("pubmedconnectors", "pubmedCon", connector, reverseConnector);
            cooccurDis = cooccurrenceWeight("diseaseconnectors", "omimCon", connector, reverseConnector);
        }

        double ideoScore = ideogramScore(cols[COL_CHR_A], cols[COL_IDEOGRAM_A],
                cols[COL_CHR_B], cols[COL_IDEOGRAM_B]);
        double ddScore = domainInteractionScore(cols[COL_DOMAINS_A], cols[COL_DOMAINS_B]);

        return uniprotA + "\t" + uniprotB + "\t" + ddScore + "\t" + ideoScore + "\t" + coExpressScore + "\t"
                + corumScore + "\t" + cooccurPub + "\t" + cooccurPath + "\t" + cooccurDis + "\t"
                + goBPScore + "\t" + goCCScore + "\t" + goMFScore + "\n";
    }

    /** Returns the coexpression value ({@code ns1:pcc}) of the first result row for the two genes, or 0. */
    private static double coexpressionScore(String entrezA, String entrezB) throws IOException {
        // The second UNION branch swaps the two genes so the pair is found in either
        // orientation; previously both branches asked for the same orientation.
        String query = "PREFIX ppi: <http://data.bioinfo.deri.ie/>" +
                "PREFIX ns1: <http://coxpresdb.jp/rdf/def/0.1/>" +
                "PREFIX geneid: <http://bio2rdf.org/geneid:>" +
                "SELECT DISTINCT * WHERE {{" +
                "?coexpressInt ns1:gene_id_1 geneid:" + entrezA + ";ns1:gene_id_2 geneid:" + entrezB + ";ns1:pcc ?pcc" +
                "} UNION {" +
                "?coexpressInt ns1:gene_id_1 geneid:" + entrezB + ";ns1:gene_id_2 geneid:" + entrezA + ";ns1:pcc ?pcc" +
                "}}";
        String[] cells = runQuery(query);
        // Cells 0-1 are the header (?coexpressInt, ?pcc); cell 3 is ?pcc of the first row.
        return cells.length > 3 ? parseDoubleCell(cells[3]) : 0.0;
    }

    /** Returns the number of distinct CORUM complexes listing both proteins as components, or 0. */
    private static int sharedCorumComplexes(String uniprotA, String uniprotB) throws IOException {
        String query = "SELECT (COUNT (DISTINCT ?s) AS ?complexes) WHERE {" +
                "?s <http://data.bioinfo.deri.ie/componets> <http://purl.uniprot.org/uniprot/" + uniprotA
                + "> , <http://purl.uniprot.org/uniprot/" + uniprotB + "> ." +
                "}";
        String[] cells = runQuery(query);
        // Cell 0 is the header; cell 1 is the count. Absent when the query failed.
        return cells.length > 1 ? parseIntCell(cells[1]) : 0;
    }

    /**
     * Counts the (term of A, term of B) pairs that are equal, using the ';'-separated
     * term lists stored in {@code terms}. Returns 0 when either protein has no entry.
     */
    private static int sharedGoTerms(HashMap<String, String> terms, String uniprotA, String uniprotB) {
        String listA = terms.get(uniprotA);
        String listB = terms.get(uniprotB);
        if (listA == null || listB == null || listA.isEmpty() || listB.isEmpty()) {
            return 0;
        }
        String[] termsB = listB.split(";");
        int shared = 0;
        for (String termA : listA.split(";")) {
            for (String termB : termsB) {
                // Plain equality: the terms are identifiers, not regular expressions.
                if (termA.equals(termB)) {
                    shared++;
                }
            }
        }
        return shared;
    }

    /**
     * Returns the weight of the first connector found (in either orientation) in the
     * graph {@code http://tcga.deri.ie/<graph>/}, or 0 when none exists.
     */
    private static int cooccurrenceWeight(String graph, String connectorClass,
                                          String connector, String reverseConnector) throws IOException {
        String query = "SELECT * WHERE {" +
                "GRAPH <http://tcga.deri.ie/" + graph + "/> { {" +
                "<http://tcga.deri.ie/con:" + connector + "> a <http://tcga.deri.ie/schema/" + connectorClass + "> ;" +
                "<http://tcga.deri.ie/schema/weight> ?weight" +
                "} UNION { " +
                "<http://tcga.deri.ie/con:" + reverseConnector + "> a <http://tcga.deri.ie/schema/" + connectorClass + "> ;" +
                "<http://tcga.deri.ie/schema/weight> ?weight" +
                "}}}";
        String[] cells = runQuery(query);
        return cells.length > 1 ? parseIntCell(cells[1]) : 0;
    }

    /** Returns the inferred ideogram-ideogram interaction score, or 0 when either ideogram is empty or unknown. */
    private static double ideogramScore(String chrA, String ideogramA, String chrB, String ideogramB)
            throws IOException {
        if (ideogramA.isEmpty() || ideogramB.isEmpty()) {
            return 0.0;
        }
        String bandA = "chr" + chrA + "_" + ideogramA;
        String bandB = "chr" + chrB + "_" + ideogramB;
        String query = "SELECT * WHERE {{ <http://data.bioinfo.deri.ie/IIinter:" + bandA + "-" + bandB
                + "> <http://data.bioinfo.deri.ie/inferredScore> ?inferredScore " +
                "} UNION { <http://data.bioinfo.deri.ie/IIinter:" + bandB + "-" + bandA
                + "> <http://data.bioinfo.deri.ie/inferredScore> ?inferredScore" +
                "}}";
        String[] cells = runQuery(query);
        return cells.length > 1 ? parseDoubleCell(cells[1]) : 0.0;
    }

    /**
     * Sums, over every (domain of A, domain of B) combination, the largest score or
     * inferred score reported for that domain pair in either orientation.
     * Returns 0 when either domain column is {@code "NA"}.
     */
    private static double domainInteractionScore(String domainsA, String domainsB) throws IOException {
        if (NO_DOMAINS.equals(domainsA) || NO_DOMAINS.equals(domainsB)) {
            return 0.0;
        }
        String[] partsB = domainsB.split(";");
        double total = 0.0;
        for (String domainA : domainsA.split(";")) {
            for (String domainB : partsB) {
                String forward = "<http://data.bioinfo.deri.ie/DDinter:" + domainA + "_" + domainB + ">";
                String reverse = "<http://data.bioinfo.deri.ie/DDinter:" + domainB + "_" + domainA + ">";
                String query = "SELECT * WHERE { { " + forward + " a <http://data.bioinfo.deri.ie/DDinter> ." +
                        "OPTIONAL {" + forward + " <http://data.bioinfo.deri.ie/score> ?score} ." +
                        "OPTIONAL {" + forward + " <http://data.bioinfo.deri.ie/inferredScore> ?inferredScore }" +
                        "} UNION { " +
                        reverse + " a <http://data.bioinfo.deri.ie/DDinter> ." +
                        "OPTIONAL {" + reverse + " <http://data.bioinfo.deri.ie/score> ?score} ." +
                        "OPTIONAL {" + reverse + " <http://data.bioinfo.deri.ie/inferredScore> ?inferredScore}" +
                        "}}";
                String[] cells = runQuery(query);
                // Cells 0-1 are the header; every later cell is a (possibly empty) score value.
                double best = 0.0;
                for (int k = 2; k < cells.length; k++) {
                    double current = parseDoubleCell(cells[k]);
                    if (current > best) {
                        best = current;
                    }
                }
                total += best;
            }
        }
        return total;
    }

    /** Sends {@code sparql} to the endpoint and returns the flattened tab-separated result cells. */
    private static String[] runQuery(String sparql) throws IOException {
        String endpoint = System.getProperty(ENDPOINT_PROPERTY, DEFAULT_SPARQL_ENDPOINT);
        String url = endpoint + QUERY_PREFIX + URLEncoder.encode(sparql, "UTF-8") + QUERY_SUFFIX;
        return httpGet(url).split("\t");
    }

    /** Trims the cell and removes one pair of surrounding double quotes if, and only if, both are present. */
    private static String stripQuotes(String cell) {
        String value = cell.trim();
        int last = value.length() - 1;
        if (last >= 1 && value.charAt(0) == '"' && value.charAt(last) == '"') {
            return value.substring(1, last);
        }
        return value;
    }

    /** Parses a result cell as a double; empty or non-numeric cells yield 0 (the latter with a warning). */
    private static double parseDoubleCell(String cell) {
        String value = stripQuotes(cell);
        if (value.isEmpty()) {
            return 0.0;
        }
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            System.err.println("Ignoring non-numeric result cell: " + cell);
            return 0.0;
        }
    }

    /** Parses a result cell as an int; empty or non-numeric cells yield 0 (the latter with a warning). */
    private static int parseIntCell(String cell) {
        String value = stripQuotes(cell);
        if (value.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            System.err.println("Ignoring non-numeric result cell: " + cell);
            return 0;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment