Last active
March 6, 2019 13:29
-
-
Save lindenb/293389 to your computer and use it in GitHub Desktop.
see http://plindenbaum.blogspot.com/2010/02/using-fasta-file-as-source-of-rdf.html rdf arq jena fasta bioinformatics sparql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Author: Pierre Lindenbaum PhD
 * http://plindenbaum.blogspot.com
 * January 2010
 * About: this code makes a FASTA file a source of triples for a Jena RDF graph.
 */
package test; | |
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Stack;
import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.graph.TripleMatch;
import com.hp.hpl.jena.graph.impl.GraphBase;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.shared.JenaException;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
import com.hp.hpl.jena.util.iterator.NiceIterator;
import com.hp.hpl.jena.vocabulary.DC;
import com.hp.hpl.jena.vocabulary.RDF;
/** | |
* FastaModel | |
* Implements a Graph | |
*/ | |
public class FastaModel | |
extends GraphBase | |
{ | |
/** the source of the fasta sequence */ | |
private File fastaFile=null; | |
/** a simple fasta sequence */ | |
private static class FastaSequence | |
{ | |
StringBuilder name=new StringBuilder(); | |
StringBuilder sequence=new StringBuilder(); | |
} | |
/** an iterator scanning a fasta file a returning some | |
* com.hp.hpl.jena.graph.Triple | |
* */ | |
private class FastaIterator | |
extends NiceIterator<Triple> | |
{ | |
/** the file reader */ | |
private PushbackReader reader; | |
/** a filter for the triples */ | |
private Triple filter; | |
/** the queue of triple to be returned */ | |
private Stack<Triple> triples_queue =new Stack<Triple>(); | |
/** constructor opens the file reader | |
* @param matcher the filter of triples | |
* */ | |
FastaIterator(TripleMatch matcher) throws IOException | |
{ | |
this.filter=matcher.asTriple(); | |
try | |
{ | |
this.reader=new PushbackReader(new FileReader(FastaModel.this.fastaFile)); | |
} | |
catch (IOException e) | |
{ | |
throw new JenaException(e); | |
} | |
} | |
/** read the next FastaSequence in the stream or return null */ | |
private FastaSequence readNext() throws IOException | |
{ | |
if(this.reader==null) return null; | |
int c; | |
FastaSequence seq=null; | |
while((c=this.reader.read())!=-1) | |
{ | |
if(c=='>') | |
{ | |
if(seq!=null) | |
{ | |
this.reader.unread(c); | |
return seq; | |
} | |
seq=new FastaSequence(); | |
while((c=this.reader.read())!=-1) | |
{ | |
if(c=='\n') break; | |
seq.name.append((char)c); | |
} | |
} | |
else if(seq!=null && Character.isLetter(c)) | |
{ | |
seq.sequence.append((char)c); | |
} | |
} | |
this.close(); | |
return seq; | |
} | |
/** chech if there are some triples in the queue | |
* if not look if there is one more FastaSequence in the file | |
* this FastaSequence is tranformed to a set of Triples that will | |
* be added to the triples_queue if they match this.matcher | |
* */ | |
@Override | |
public boolean hasNext() | |
{ | |
if(!triples_queue.isEmpty()) return true; | |
if(this.reader==null) return false; | |
try | |
{ | |
/* loop until the queue is not empty or the stream is closed */ | |
while(this.triples_queue.isEmpty()) | |
{ | |
//try to get a new fasta sequence | |
FastaSequence seq=readNext(); | |
if(seq==null) return false; | |
String name=seq.name.toString(); | |
//check it is a genbank file with a gi | |
if(!name.startsWith("gi|")) | |
{ | |
continue; | |
} | |
int i=name.indexOf('|',3); | |
if(i==-1) continue; | |
//create the subject | |
Node subject =Node.createURI("http://www.ncbi.nlm.nih.gov/nuccore/"+name.substring(3,i)); | |
//make a triple for the rdf:type | |
Triple triple=new Triple( | |
subject, | |
RDF.type.asNode(), | |
Node.createURI("urn:lindenb:ontology:Sequence") | |
); | |
//append this triple to the queue if it is accepted by this.filter | |
if(this.filter.asTriple().matches(triple)) | |
{ | |
this.triples_queue.add(triple); | |
} | |
//make a triple for the dc:title | |
triple=new Triple( | |
subject, | |
DC.title.asNode(), | |
Node.createLiteral(name) | |
); | |
//append this triple to the queue if it is accepted by this.filter | |
if(this.filter.asTriple().matches(triple)) | |
{ | |
this.triples_queue.add(triple); | |
} | |
//make a triple for the DNA sequence | |
triple=new Triple( | |
subject, | |
Node.createURI("urn:lindenb:ontology:sequence"), | |
Node.createLiteral(seq.sequence.toString()) | |
); | |
//append this triple to the queue if it is accepted by this.filter | |
if(this.filter.asTriple().matches(triple)) | |
{ | |
this.triples_queue.add(triple); | |
} | |
//make a triple for the size of this sequence | |
triple=new Triple( | |
subject, | |
Node.createURI("urn:lindenb:ontology:length"), | |
Node.createLiteral(String.valueOf(seq.sequence.length()),null,XSDDatatype.XSDint) | |
); | |
//append this triple to the queue if it is accepted by this.filter | |
if(this.filter.asTriple().matches(triple)) | |
{ | |
this.triples_queue.add(triple); | |
} | |
} | |
} | |
catch (IOException e) | |
{ | |
close(); | |
throw new JenaException(e); | |
} | |
return !triples_queue.isEmpty(); | |
} | |
@Override | |
public Triple next() | |
{ | |
if(this.triples_queue.isEmpty()) hasNext(); | |
if(this.triples_queue.isEmpty()) throw new IllegalStateException(); | |
return this.triples_queue.pop(); | |
} | |
@Override | |
public void close() | |
{ | |
try | |
{ | |
if(this.reader!=null) reader.close(); | |
} | |
catch (IOException e) | |
{ | |
throw new JenaException(e); | |
} | |
finally | |
{ | |
this.reader=null; | |
super.close(); | |
} | |
} | |
} | |
@Override | |
protected ExtendedIterator<Triple> graphBaseFind(TripleMatch matcher) | |
{ | |
try | |
{ | |
return new FastaIterator(matcher); | |
} | |
catch (IOException e) | |
{ | |
throw new JenaException(e); | |
} | |
} | |
public FastaModel(File fastaFile) | |
{ | |
this.fastaFile=fastaFile; | |
} | |
@Override | |
public void close() | |
{ | |
super.close(); | |
System.err.println("Close called"); | |
} | |
public static void main(String[] args) | |
{ | |
try | |
{ | |
Model m=ModelFactory.createModelForGraph( | |
new FastaModel( | |
new File("rotavirus.fa") | |
)); | |
StmtIterator i=m.listStatements(); | |
while(i.hasNext()) | |
{ | |
System.err.println(i.next()); | |
} | |
System.err.println("OK"); | |
ResIterator r=m.listSubjects(); | |
while(r.hasNext()) | |
{ | |
System.err.println(r.next()); | |
} | |
r.close(); | |
i=m.listStatements(null,DC.title,(Literal)null); | |
while(i.hasNext()) | |
{ | |
System.err.println(i.next()); | |
} | |
System.err.println("OK"); | |
Query query=QueryFactory.create( | |
"SELECT ?Seq1 ?Len1 ?Seq2 ?Len2" + | |
"{" + | |
"?Seq1 a <urn:lindenb:ontology:Sequence> . " + | |
"?Seq1 <urn:lindenb:ontology:length> ?Len1 . " + | |
"?Seq2 a <urn:lindenb:ontology:Sequence> . " + | |
"?Seq2 <urn:lindenb:ontology:length> ?Len2 . " + | |
"FILTER (?Seq1!=?Seq2 && ?Len1 < ?Len2) "+ | |
"}" | |
); | |
QueryExecution execution = QueryExecutionFactory.create(query, m); | |
ResultSet row=execution.execSelect(); | |
while(row.hasNext()) | |
{ | |
QuerySolution solution=row.next(); | |
for(Iterator<String> si=solution.varNames();si.hasNext();) | |
{ | |
String name=si.next(); | |
System.out.println(name+" : "+solution.get(name)); | |
} | |
System.out.println(); | |
} | |
} | |
catch (Exception e) | |
{ | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment