-
-
Save lindenb/67bb728957abb16a680b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package biohack; | |
import javax.xml.parsers.DocumentBuilderFactory; | |
import javax.xml.stream.XMLOutputFactory; | |
import javax.xml.stream.XMLStreamException; | |
import javax.xml.stream.XMLStreamWriter; | |
import org.lindenb.sw.vocabulary.DC; | |
import org.lindenb.sw.vocabulary.RDF; | |
import org.lindenb.sw.vocabulary.XSD; | |
import org.lindenb.util.StringUtils; | |
import org.lindenb.xml.XMLUtilities; | |
import org.w3c.dom.Attr; | |
import org.w3c.dom.Document; | |
import org.w3c.dom.Element; | |
import org.w3c.dom.NamedNodeMap; | |
import org.w3c.dom.Node; | |
import org.xml.sax.InputSource; | |
public class SRAParser | |
{ | |
private String prefix="my"; | |
private String namespace="urn:mynamespace:"; | |
private XMLStreamWriter out=null; | |
public void parse(InputSource is) throws Exception | |
{ | |
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance(); | |
f.setCoalescing(true); | |
f.setNamespaceAware(true); | |
f.setValidating(false); | |
f.setExpandEntityReferences(true); | |
f.setIgnoringComments(true); | |
f.setIgnoringElementContentWhitespace(true); | |
Document dom= f.newDocumentBuilder().parse(is); | |
Element root=dom.getDocumentElement(); | |
echo(root); | |
} | |
public void start() throws Exception | |
{ | |
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance(); | |
this.out= xmlfactory.createXMLStreamWriter(System.out,"UTF-8"); | |
this.out.writeStartDocument("UTF-8","1.0"); | |
this.out.writeStartElement("rdf", "RDF", RDF.NS); | |
this.out.writeNamespace("xsd",XSD.NS); | |
this.out.writeNamespace("rdf",RDF.NS); | |
this.out.writeNamespace("dc",DC.NS); | |
this.out.writeNamespace(this.prefix,this.namespace); | |
endl(); | |
} | |
public void end() throws Exception | |
{ | |
this.out.writeEndDocument(); | |
this.out.flush(); | |
this.out.close(); | |
this.out=null; | |
} | |
private void endl() throws XMLStreamException | |
{ | |
this.out.writeCharacters("\n"); | |
} | |
protected boolean isContainer(Element root) | |
{ | |
if(StringUtils.isIn(root.getLocalName(), | |
"FILES")) return true; | |
if( root.getNodeName().endsWith("_ATTRIBUTES")) return true; | |
if( root.getNodeName().endsWith("_LINKS")) return true; | |
return root.getNodeName().endsWith("_SET"); | |
} | |
protected void echo(Element root) | |
throws XMLStreamException | |
{ | |
if(isContainer(root)) | |
{ | |
for(Element c: XMLUtilities.elements(root)) | |
{ | |
echo(c); | |
} | |
return; | |
} | |
writeIndividual(root); | |
} | |
protected void properties(Element root) | |
throws XMLStreamException | |
{ | |
for(Element c:XMLUtilities.elements(root)) | |
{ | |
property(c); | |
} | |
} | |
protected void property(Element root) | |
throws XMLStreamException | |
{ | |
if( !root.hasChildNodes() && | |
!root.hasAttributes()) | |
{ | |
return; | |
} | |
else if(isSimpleNode(root)) | |
{ | |
simpleElement(root); | |
} | |
else if(isContainer(root)) | |
{ | |
properties(root); | |
} | |
else | |
{ | |
out.writeStartElement(this.prefix, getPropertyNameFor(root),this.namespace); | |
endl(); | |
writeIndividual(root); | |
endl(); | |
out.writeEndElement(); | |
endl(); | |
} | |
} | |
protected void writeIndividual(Element root) | |
throws XMLStreamException | |
{ | |
this.out.writeStartElement( | |
getPrefixFor(root), | |
getIndividualNameFor(root), | |
getNamespaceFor(root) ); | |
String uri=getResourceFor(root); | |
if(uri!=null) | |
{ | |
out.writeAttribute("rdf", RDF.NS, "about", uri); | |
} | |
endl(); | |
attributes(root); | |
properties(root); | |
this.out.writeEndElement(); | |
endl(); | |
} | |
protected String getPrefixFor(Node root) | |
{ | |
if(root.getLocalName().equals("title")) return "dc"; | |
return this.prefix; | |
} | |
protected String getNamespaceFor(Node root) | |
{ | |
if(root.getLocalName().equals("title")) return DC.NS; | |
return this.prefix; | |
} | |
protected boolean isSimpleNode(Node root) | |
{ | |
if(root.getNodeType()==Node.ATTRIBUTE_NODE) return true; | |
return !root.hasAttributes() && XMLUtilities.count(root)==0; | |
} | |
protected void attributes(Element node) | |
throws XMLStreamException | |
{ | |
NamedNodeMap map=node.getAttributes(); | |
for(int i=0;i< map.getLength();++i) | |
{ | |
if(isIgnorableAttribute(Attr.class.cast(map.item(i)))) continue; | |
attribute(Attr.class.cast(map.item(i))); | |
} | |
} | |
protected boolean isIgnorableAttribute(Attr att) | |
{ | |
if(!StringUtils.isBlank(att.getNamespaceURI())) | |
{ | |
return true; | |
} | |
if(att.getLocalName().equals("noNamespaceSchemaLocation")) return true; | |
return false; | |
} | |
protected void attribute(Attr root) | |
throws XMLStreamException | |
{ | |
simpleElement(root); | |
} | |
protected void simpleElement(Node root) | |
throws XMLStreamException | |
{ | |
String value=getValueOf(root); | |
if(value.isEmpty()) return; | |
if(isResource(root)) | |
{ | |
writeResource(root); | |
} | |
else | |
{ | |
writeLiteral(root); | |
} | |
} | |
protected void writeResource(Node root) | |
throws XMLStreamException | |
{ | |
this.out.writeEmptyElement( | |
this.prefix, | |
getLocalNameFor(root), | |
this.namespace | |
); | |
this.out.writeAttribute( | |
"rdf", RDF.NS, "resource", | |
getResourceFor(root)); | |
endl(); | |
} | |
protected void writeLiteral(Node root) | |
throws XMLStreamException | |
{ | |
this.out.writeStartElement( | |
this.prefix, | |
getLocalNameFor(root), | |
this.namespace | |
); | |
String datatype= getDataType(root); | |
if(datatype!=null) | |
{ | |
this.out.writeAttribute("rdf", RDF.NS, "datatype", datatype); | |
} | |
String lang= getLang(root); | |
if(lang!=null) | |
{ | |
this.out.writeAttribute("xml:lang", lang); | |
} | |
this.out.writeCharacters(getValueOf(root)); | |
this.out.writeEndElement(); | |
endl(); | |
} | |
protected String getLang(Node node) | |
{ | |
String name=node.getLocalName(); | |
if(StringUtils.isIn(name,"STUDY_DESCRIPTION","LIBRARY_CONSTRUCTION_PROTOCOL")) | |
{ | |
return "en"; | |
} | |
return null; | |
} | |
protected String getDataType(Node node) | |
{ | |
String name=node.getLocalName(); | |
if(StringUtils.isIn(name, | |
"expected_number_reads", | |
"expected_number_reads", | |
"expected_number_spots", | |
"follows_read_index", | |
"CYCLE_COUNT", | |
"CYCLE_COUNT", | |
"FLOW_COUNT", | |
"SEQUENCE_LENGTH" | |
)) | |
{ | |
return XSD.NS+"#positiveInteger"; | |
} | |
if(StringUtils.isIn(name, | |
"format_code", | |
"HoldForPeriod", | |
"serial", | |
"NUMBER_OF_LEVELS", | |
"TAXON_ID")) | |
{ | |
return XSD.NS+"#int"; | |
} | |
if(StringUtils.isIn(name, | |
"inform_on_error", | |
"inform_on_status", | |
"URL" | |
)) | |
{ | |
return XSD.NS+"#anyURI"; | |
} | |
if(StringUtils.isIn(name, | |
"run_date", | |
"submission_date" | |
)) | |
{ | |
return XSD.NS+"#dateTime"; | |
} | |
if(StringUtils.isIn(name, | |
"IS_PRIMARY" | |
)) | |
{ | |
return XSD.NS+"#boolean"; | |
} | |
if(StringUtils.isIn(name, | |
"proportion" | |
)) | |
{ | |
return XSD.NS+"#float"; | |
} | |
if(StringUtils.isIn(name, | |
"BASE_COORD", | |
"CYCLE_COORD" | |
)) | |
{ | |
return XSD.NS+"#integer"; | |
} | |
if(StringUtils.isIn(name, | |
"follows_read_index", | |
"max_mismatch", | |
"min_match", | |
"NOMINAL_LENGTH", | |
"number_channels", | |
"precedes_read_index", | |
"region", | |
"sector", | |
"total_reads", | |
"total_spots", | |
"ID", | |
"PROJECT_ID", | |
"READ_INDEX")) | |
{ | |
return XSD.NS+"#nonNegativeInteger"; | |
} | |
if(StringUtils.isIn(name, | |
"NUMBER_OF_READS_PER_SPOT", | |
"SPOT_DECODE_METHOD", | |
"SPOT_LENGTH")) | |
{ | |
return XSD.NS+"#unsignedInt"; | |
} | |
if(StringUtils.isIn(name, | |
"NOMINAL_SDEV", | |
"MULTIPLIER")) | |
{ | |
return XSD.NS+"#double"; | |
} | |
return null; | |
} | |
protected String getIndividualNameFor(Node root) | |
{ | |
String s= getLocalNameFor(root); | |
s= s.substring(0,1).toUpperCase()+s.substring(1); | |
return s; | |
} | |
protected String getLocalNameFor(Node root) | |
{ | |
String s= root.getLocalName(); | |
if(s.endsWith("_REF")) s=s.substring(0,s.length()-4); | |
StringBuilder b=new StringBuilder(s.length()); | |
boolean maj=false; | |
for(int i=0;i< s.length();++i) | |
{ | |
char c=s.charAt(i); | |
if(c=='_' || c=='-') | |
{ | |
maj=true; | |
continue; | |
} | |
c=(maj?Character.toUpperCase(c):Character.toLowerCase(c)); | |
b.append(c); | |
maj=false; | |
} | |
return b.toString(); | |
} | |
protected String getPropertyNameFor(Node root) | |
{ | |
String s= getLocalNameFor(root); | |
s=s.substring(0,1).toUpperCase()+s.substring(1); | |
s="has"+s; | |
return s; | |
} | |
protected boolean isResource(Node root) | |
{ | |
return getResourceFor(root)!=null; | |
} | |
protected String getResourceFor(Node root) | |
{ | |
if(StringUtils.isIn(root.getLocalName(), | |
"sra_object_type","filetype","checksum_method","LIBRARY_STRATEGY", | |
"LIBRARY_SOURCE","LIBRARY_SELECTION","READ_TYPE","INSTRUMENT_MODEL", | |
"SEQUENCE_SPACE" | |
)) | |
{ | |
return "urn:sra:"+getLocalNameFor(root)+":"+root.getTextContent().replace(' ', '_'); | |
} | |
if(root.getLocalName().equals("TAXON_ID")) | |
{ | |
return "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id="+root.getTextContent(); | |
} | |
if(root.getNodeType()==Node.ELEMENT_NODE) | |
{ | |
if(root.hasAttributes() && | |
Element.class.cast(root).hasAttribute("accession") | |
) | |
{ | |
String s=getLocalNameFor(root).toLowerCase(); | |
return "urn:sra:"+ | |
s+":"+ | |
Element.class.cast(root).getAttribute("accession") | |
; | |
} | |
Element c= XMLUtilities.firstChild(root,"URL"); | |
if(c!=null) return c.getTextContent(); | |
} | |
return null; | |
} | |
protected String getValueOf(Node root) | |
{ | |
String value=""; | |
switch(root.getNodeType()) | |
{ | |
case Node.ATTRIBUTE_NODE: | |
value= Attr.class.cast(root).getValue(); | |
break; | |
default: | |
value= root.getTextContent(); | |
break; | |
} | |
value=value.trim(); | |
return value; | |
} | |
public static void main(String[] args) { | |
try | |
{ | |
SRAParser app=new SRAParser(); | |
int optind=0; | |
while(optind< args.length) | |
{ | |
if(args[optind].equals("-h") || | |
args[optind].equals("-help") || | |
args[optind].equals("--help")) | |
{ | |
System.err.println("Options:"); | |
System.err.println(" -h help; This screen."); | |
return; | |
} | |
else if(args[optind].equals("-azdazdazd")) | |
{ | |
} | |
else if(args[optind].equals("--")) | |
{ | |
optind++; | |
break; | |
} | |
else if(args[optind].startsWith("-")) | |
{ | |
System.err.println("Unknown option "+args[optind]); | |
return; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
app.start(); | |
if(optind==args.length) | |
{ | |
app.parse(new InputSource(System.in)); | |
} | |
else | |
{ | |
while(optind< args.length) | |
{ | |
app.parse(new InputSource(args[optind++])); | |
} | |
} | |
app.end(); | |
} | |
catch(Throwable err) | |
{ | |
err.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment