Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created February 10, 2010 23:11
Show Gist options
  • Save lindenb/67bb728957abb16a680b to your computer and use it in GitHub Desktop.
Save lindenb/67bb728957abb16a680b to your computer and use it in GitHub Desktop.
package biohack;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.lindenb.sw.vocabulary.DC;
import org.lindenb.sw.vocabulary.RDF;
import org.lindenb.sw.vocabulary.XSD;
import org.lindenb.util.StringUtils;
import org.lindenb.xml.XMLUtilities;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
/**
 * Reads XML documents into a DOM tree and re-emits their content as RDF/XML
 * on standard output.
 *
 * <p>Typical life-cycle, as driven by {@link #main(String[])}:
 * {@link #start()} once, {@link #parse(InputSource)} once per input document,
 * then {@link #end()} once.</p>
 *
 * <p>Mapping rules implemented below:</p>
 * <ul>
 * <li>"container" elements (see {@link #isContainer(Element)}) produce no
 *     output of their own; only their children are processed;</li>
 * <li>any other element reached through {@link #echo(Element)} becomes an
 *     RDF individual (see {@link #writeIndividual(Element)});</li>
 * <li>attributes and "simple" child elements become either literal
 *     properties or <code>rdf:resource</code> links;</li>
 * <li>complex child elements become a <code>hasXxx</code> property wrapping
 *     a nested individual.</li>
 * </ul>
 *
 * <p>NOTE(review): the element/attribute names that are special-cased
 * (LIBRARY_STRATEGY, TAXON_ID, accession, ...) look like the NCBI SRA
 * metadata schema; this is inferred from the names only.</p>
 *
 * <p>Instances hold a single output writer and are not designed for
 * concurrent use.</p>
 */
public class SRAParser
{
    /** Prefix used for every element that is not mapped to another vocabulary. */
    private String prefix="my";
    /** Namespace URI bound to {@link #prefix}. */
    private String namespace="urn:mynamespace:";
    /** Output writer; non-null only between {@link #start()} and {@link #end()}. */
    private XMLStreamWriter out=null;

    /**
     * Parses one XML document and writes its RDF representation to the
     * currently opened output.
     *
     * @param is the XML source to read
     * @throws IllegalStateException if {@link #start()} has not been called
     * @throws Exception on any parsing or writing error
     */
    public void parse(InputSource is) throws Exception
    {
        // Fail fast with a clear message instead of a NullPointerException
        // raised later from the first write.
        if(this.out==null)
        {
            throw new IllegalStateException("start() must be called before parse()");
        }
        DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
        // NOTE(review): DTDs and external entities are NOT disabled here and
        // entity references are expanded. If the input can come from an
        // untrusted party, this is exposed to XXE; consider setting the
        // "disallow-doctype-decl" feature after checking that legitimate
        // inputs carry no DOCTYPE.
        f.setCoalescing(true);
        f.setNamespaceAware(true);
        f.setValidating(false);
        f.setExpandEntityReferences(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        Document dom= f.newDocumentBuilder().parse(is);
        Element root=dom.getDocumentElement();
        echo(root);
    }

    /**
     * Opens the output: creates a UTF-8 stream writer on {@code System.out},
     * writes the XML declaration and the opening <code>rdf:RDF</code> element
     * together with the namespace declarations used by this class.
     *
     * @throws Exception if the writer cannot be created or written to
     */
    public void start() throws Exception
    {
        XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
        this.out= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
        this.out.writeStartDocument("UTF-8","1.0");
        this.out.writeStartElement("rdf", "RDF", RDF.NS);
        this.out.writeNamespace("xsd",XSD.NS);
        this.out.writeNamespace("rdf",RDF.NS);
        this.out.writeNamespace("dc",DC.NS);
        this.out.writeNamespace(this.prefix,this.namespace);
        endl();
    }

    /**
     * Finishes the output document, flushes and closes the writer.
     * Does nothing when no output is currently open.
     *
     * @throws Exception on any writing error
     */
    public void end() throws Exception
    {
        if(this.out==null) return;
        // writeEndDocument() closes every element still open, which is how
        // the rdf:RDF element opened in start() gets its end tag.
        this.out.writeEndDocument();
        this.out.flush();
        this.out.close();
        this.out=null;
    }

    /** Writes a single line feed to the output. */
    private void endl() throws XMLStreamException
    {
        this.out.writeCharacters("\n");
    }

    /**
     * Tells whether an element is a pure grouping node that must not produce
     * an individual or a property of its own.
     *
     * @param root the element to test
     * @return true for <code>FILES</code> and for any element whose name ends
     *         with <code>_ATTRIBUTES</code>, <code>_LINKS</code> or <code>_SET</code>
     */
    protected boolean isContainer(Element root)
    {
        if(StringUtils.isIn(root.getLocalName(),"FILES")) return true;
        String nodeName=root.getNodeName();
        return nodeName.endsWith("_ATTRIBUTES")
            || nodeName.endsWith("_LINKS")
            || nodeName.endsWith("_SET");
    }

    /**
     * Writes an element as a top-level individual, descending through
     * container elements first.
     *
     * @param root the element to process
     * @throws XMLStreamException on any writing error
     */
    protected void echo(Element root)
        throws XMLStreamException
    {
        if(isContainer(root))
        {
            // presumably iterates over the child elements of root
            // (XMLUtilities is a project helper not visible here)
            for(Element c: XMLUtilities.elements(root))
            {
                echo(c);
            }
            return;
        }
        writeIndividual(root);
    }

    /**
     * Writes every element returned by <code>XMLUtilities.elements(root)</code>
     * as a property.
     *
     * @param root the parent element
     * @throws XMLStreamException on any writing error
     */
    protected void properties(Element root)
        throws XMLStreamException
    {
        for(Element c:XMLUtilities.elements(root))
        {
            property(c);
        }
    }

    /**
     * Writes one child element as a property of the enclosing individual.
     *
     * @param root the child element
     * @throws XMLStreamException on any writing error
     */
    protected void property(Element root)
        throws XMLStreamException
    {
        if( !root.hasChildNodes() && !root.hasAttributes())
        {
            // nothing to say about a completely empty element
            return;
        }
        else if(isSimpleNode(root))
        {
            simpleElement(root);
        }
        else if(isContainer(root))
        {
            // containers are transparent: their children become properties
            // of the current individual
            properties(root);
        }
        else
        {
            // complex element: <my:hasXxx><my:Xxx>...</my:Xxx></my:hasXxx>
            out.writeStartElement(this.prefix, getPropertyNameFor(root),this.namespace);
            endl();
            writeIndividual(root);
            endl();
            out.writeEndElement();
            endl();
        }
    }

    /**
     * Writes an element as an RDF individual: a start tag (with
     * <code>rdf:about</code> when {@link #getResourceFor(Node)} yields a URI),
     * then its attributes and child elements as properties, then the end tag.
     *
     * @param root the element to write
     * @throws XMLStreamException on any writing error
     */
    protected void writeIndividual(Element root)
        throws XMLStreamException
    {
        this.out.writeStartElement(
            getPrefixFor(root),
            getIndividualNameFor(root),
            getNamespaceFor(root) );
        String uri=getResourceFor(root);
        if(uri!=null)
        {
            out.writeAttribute("rdf", RDF.NS, "about", uri);
        }
        endl();
        attributes(root);
        properties(root);
        this.out.writeEndElement();
        endl();
    }

    /**
     * @param root a DOM node
     * @return <code>"dc"</code> for a node whose local name is
     *         <code>title</code>, the default prefix otherwise
     */
    protected String getPrefixFor(Node root)
    {
        if("title".equals(root.getLocalName())) return "dc";
        return this.prefix;
    }

    /**
     * Returns the namespace URI matching {@link #getPrefixFor(Node)}.
     *
     * @param root a DOM node
     * @return <code>DC.NS</code> for a node whose local name is
     *         <code>title</code>, the default namespace URI otherwise
     */
    protected String getNamespaceFor(Node root)
    {
        if("title".equals(root.getLocalName())) return DC.NS;
        // must be the namespace URI, not the prefix: the value is handed to
        // XMLStreamWriter.writeStartElement(prefix, localName, namespaceURI)
        return this.namespace;
    }

    /**
     * @param root a DOM node
     * @return true for an attribute, or for a node without attributes for
     *         which <code>XMLUtilities.count(root)</code> is zero
     *         (presumably: no child element)
     */
    protected boolean isSimpleNode(Node root)
    {
        if(root.getNodeType()==Node.ATTRIBUTE_NODE) return true;
        return !root.hasAttributes() && XMLUtilities.count(root)==0;
    }

    /**
     * Writes every non-ignorable attribute of an element as a property.
     *
     * @param node the element owning the attributes
     * @throws XMLStreamException on any writing error
     */
    protected void attributes(Element node)
        throws XMLStreamException
    {
        NamedNodeMap map=node.getAttributes();
        for(int i=0;i< map.getLength();++i)
        {
            Attr att=Attr.class.cast(map.item(i));
            if(isIgnorableAttribute(att)) continue;
            attribute(att);
        }
    }

    /**
     * @param att an attribute
     * @return true when the attribute has a non-blank namespace URI (as
     *         judged by <code>StringUtils.isBlank</code>) or is named
     *         <code>noNamespaceSchemaLocation</code>
     */
    protected boolean isIgnorableAttribute(Attr att)
    {
        if(!StringUtils.isBlank(att.getNamespaceURI()))
        {
            return true;
        }
        return "noNamespaceSchemaLocation".equals(att.getLocalName());
    }

    /**
     * Writes one attribute as a property.
     *
     * @param root the attribute
     * @throws XMLStreamException on any writing error
     */
    protected void attribute(Attr root)
        throws XMLStreamException
    {
        simpleElement(root);
    }

    /**
     * Writes a simple node (attribute or leaf element) either as a resource
     * link or as a literal. Nodes whose trimmed value is empty are skipped.
     *
     * @param root the node to write
     * @throws XMLStreamException on any writing error
     */
    protected void simpleElement(Node root)
        throws XMLStreamException
    {
        String value=getValueOf(root);
        if(value.isEmpty()) return;
        if(isResource(root))
        {
            writeResource(root);
        }
        else
        {
            writeLiteral(root);
        }
    }

    /**
     * Writes an empty property element carrying an <code>rdf:resource</code>
     * attribute set to {@link #getResourceFor(Node)}.
     *
     * @param root the node to write
     * @throws XMLStreamException on any writing error
     */
    protected void writeResource(Node root)
        throws XMLStreamException
    {
        this.out.writeEmptyElement(
            this.prefix,
            getLocalNameFor(root),
            this.namespace
            );
        this.out.writeAttribute(
            "rdf", RDF.NS, "resource",
            getResourceFor(root));
        endl();
    }

    /**
     * Writes a property element whose content is the trimmed value of the
     * node, with an optional <code>rdf:datatype</code> and/or
     * <code>xml:lang</code> attribute.
     *
     * @param root the node to write
     * @throws XMLStreamException on any writing error
     */
    protected void writeLiteral(Node root)
        throws XMLStreamException
    {
        this.out.writeStartElement(
            this.prefix,
            getLocalNameFor(root),
            this.namespace
            );
        String datatype= getDataType(root);
        if(datatype!=null)
        {
            this.out.writeAttribute("rdf", RDF.NS, "datatype", datatype);
        }
        String lang= getLang(root);
        if(lang!=null)
        {
            this.out.writeAttribute("xml:lang", lang);
        }
        this.out.writeCharacters(getValueOf(root));
        this.out.writeEndElement();
        endl();
    }

    /**
     * @param node a DOM node
     * @return <code>"en"</code> for the free-text nodes listed below,
     *         null otherwise
     */
    protected String getLang(Node node)
    {
        String name=node.getLocalName();
        if(StringUtils.isIn(name,"STUDY_DESCRIPTION","LIBRARY_CONSTRUCTION_PROTOCOL"))
        {
            return "en";
        }
        return null;
    }

    /**
     * Maps the local name of a node to a datatype URI.
     *
     * <p>The groups are tested in order and the first match wins.</p>
     *
     * <p>NOTE(review): the URI is built as <code>XSD.NS+"#type"</code>; if
     * <code>XSD.NS</code> already ends with '#', the result contains a double
     * '#'. The constant is defined outside this file; verify.</p>
     *
     * @param node a DOM node
     * @return the datatype URI, or null when the name is not typed
     */
    protected String getDataType(Node node)
    {
        String name=node.getLocalName();
        // NOTE(review): "follows_read_index" was also listed in the
        // nonNegativeInteger group below, where it could never match; that
        // dead entry was dropped. If read indexes can be 0, it belongs there
        // rather than here. Confirm against the schema.
        if(StringUtils.isIn(name,
            "expected_number_reads",
            "expected_number_spots",
            "follows_read_index",
            "CYCLE_COUNT",
            "FLOW_COUNT",
            "SEQUENCE_LENGTH"
            ))
        {
            return XSD.NS+"#positiveInteger";
        }
        if(StringUtils.isIn(name,
            "format_code",
            "HoldForPeriod",
            "serial",
            "NUMBER_OF_LEVELS",
            "TAXON_ID"))
        {
            return XSD.NS+"#int";
        }
        if(StringUtils.isIn(name,
            "inform_on_error",
            "inform_on_status",
            "URL"
            ))
        {
            return XSD.NS+"#anyURI";
        }
        if(StringUtils.isIn(name,
            "run_date",
            "submission_date"
            ))
        {
            return XSD.NS+"#dateTime";
        }
        if(StringUtils.isIn(name,
            "IS_PRIMARY"
            ))
        {
            return XSD.NS+"#boolean";
        }
        if(StringUtils.isIn(name,
            "proportion"
            ))
        {
            return XSD.NS+"#float";
        }
        if(StringUtils.isIn(name,
            "BASE_COORD",
            "CYCLE_COORD"
            ))
        {
            return XSD.NS+"#integer";
        }
        if(StringUtils.isIn(name,
            "max_mismatch",
            "min_match",
            "NOMINAL_LENGTH",
            "number_channels",
            "precedes_read_index",
            "region",
            "sector",
            "total_reads",
            "total_spots",
            "ID",
            "PROJECT_ID",
            "READ_INDEX"))
        {
            return XSD.NS+"#nonNegativeInteger";
        }
        if(StringUtils.isIn(name,
            "NUMBER_OF_READS_PER_SPOT",
            "SPOT_DECODE_METHOD",
            "SPOT_LENGTH"))
        {
            return XSD.NS+"#unsignedInt";
        }
        if(StringUtils.isIn(name,
            "NOMINAL_SDEV",
            "MULTIPLIER"))
        {
            return XSD.NS+"#double";
        }
        return null;
    }

    /**
     * Upper-cases the first character of a string, independently of the
     * default locale.
     *
     * @param s a non-null string
     * @return the capitalized string; the empty string is returned unchanged
     */
    private static String capitalize(String s)
    {
        if(s.isEmpty()) return s;
        return Character.toUpperCase(s.charAt(0))+s.substring(1);
    }

    /**
     * @param root a DOM node
     * @return the class-like name of the node: {@link #getLocalNameFor(Node)}
     *         with its first letter upper-cased
     */
    protected String getIndividualNameFor(Node root)
    {
        return capitalize(getLocalNameFor(root));
    }

    /**
     * Converts the local name of a node to camelCase: a trailing
     * <code>_REF</code> is removed, <code>'_'</code> and <code>'-'</code> are
     * dropped, the character following a separator is upper-cased and every
     * other character is lower-cased (e.g. <code>LIBRARY_STRATEGY</code>
     * becomes <code>libraryStrategy</code>).
     *
     * @param root a DOM node with a non-null local name
     * @return the converted name
     */
    protected String getLocalNameFor(Node root)
    {
        String s= root.getLocalName();
        if(s.endsWith("_REF")) s=s.substring(0,s.length()-4);
        StringBuilder b=new StringBuilder(s.length());
        boolean maj=false;
        for(int i=0;i< s.length();++i)
        {
            char c=s.charAt(i);
            if(c=='_' || c=='-')
            {
                maj=true;
                continue;
            }
            c=(maj?Character.toUpperCase(c):Character.toLowerCase(c));
            b.append(c);
            maj=false;
        }
        return b.toString();
    }

    /**
     * @param root a DOM node
     * @return <code>"has"</code> followed by the capitalized
     *         {@link #getLocalNameFor(Node)} of the node
     */
    protected String getPropertyNameFor(Node root)
    {
        return "has"+capitalize(getLocalNameFor(root));
    }

    /**
     * @param root a DOM node
     * @return true when {@link #getResourceFor(Node)} yields a URI
     */
    protected boolean isResource(Node root)
    {
        return getResourceFor(root)!=null;
    }

    /**
     * Computes the URI identifying a node, if any. Rules, in order:
     * <ol>
     * <li>controlled-vocabulary nodes:
     *     <code>urn:sra:&lt;camelCaseName&gt;:&lt;text with spaces replaced by '_'&gt;</code>;</li>
     * <li><code>TAXON_ID</code>: a link into the NCBI taxonomy browser;</li>
     * <li>elements with an <code>accession</code> attribute:
     *     <code>urn:sra:&lt;lowercasename&gt;:&lt;accession&gt;</code>;</li>
     * <li>elements for which <code>XMLUtilities.firstChild(root,"URL")</code>
     *     returns an element: the text content of that element.</li>
     * </ol>
     *
     * @param root a DOM node
     * @return the URI, or null when none of the rules applies
     */
    protected String getResourceFor(Node root)
    {
        String localName=root.getLocalName();
        if(StringUtils.isIn(localName,
            "sra_object_type","filetype","checksum_method","LIBRARY_STRATEGY",
            "LIBRARY_SOURCE","LIBRARY_SELECTION","READ_TYPE","INSTRUMENT_MODEL",
            "SEQUENCE_SPACE"
            ))
        {
            return "urn:sra:"+getLocalNameFor(root)+":"+root.getTextContent().replace(' ', '_');
        }
        if("TAXON_ID".equals(localName))
        {
            return "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id="+root.getTextContent();
        }
        if(root.getNodeType()==Node.ELEMENT_NODE)
        {
            Element e=Element.class.cast(root);
            if(root.hasAttributes() && e.hasAttribute("accession"))
            {
                // Locale.ROOT keeps the generated URN identical whatever the
                // default locale of the JVM is.
                String s=getLocalNameFor(root).toLowerCase(java.util.Locale.ROOT);
                return "urn:sra:"+s+":"+e.getAttribute("accession");
            }
            Element c= XMLUtilities.firstChild(root,"URL");
            if(c!=null) return c.getTextContent();
        }
        return null;
    }

    /**
     * @param root a DOM node
     * @return the trimmed attribute value for an attribute, the trimmed text
     *         content for any other node
     */
    protected String getValueOf(Node root)
    {
        String value;
        switch(root.getNodeType())
        {
            case Node.ATTRIBUTE_NODE:
                value= Attr.class.cast(root).getValue();
                break;
            default:
                value= root.getTextContent();
                break;
        }
        return value.trim();
    }

    /**
     * Command-line entry point. Each remaining argument is handed to
     * {@link InputSource#InputSource(String)} as a system identifier; when
     * there is none, standard input is read. The RDF/XML result goes to
     * standard output; any error is reported as a stack trace on standard
     * error.
     *
     * @param args options (<code>-h</code>, <code>--</code>) followed by the inputs
     */
    public static void main(String[] args) {
        try
        {
            SRAParser app=new SRAParser();
            int optind=0;
            while(optind< args.length)
            {
                if(args[optind].equals("-h") ||
                   args[optind].equals("-help") ||
                   args[optind].equals("--help"))
                {
                    System.err.println("Options:");
                    System.err.println(" -h help; This screen.");
                    return;
                }
                else if(args[optind].equals("-azdazdazd"))
                {
                    // placeholder option: accepted and ignored
                }
                else if(args[optind].equals("--"))
                {
                    optind++;
                    break;
                }
                else if(args[optind].startsWith("-"))
                {
                    System.err.println("Unknown option "+args[optind]);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }
            app.start();
            if(optind==args.length)
            {
                app.parse(new InputSource(System.in));
            }
            else
            {
                while(optind< args.length)
                {
                    app.parse(new InputSource(args[optind++]));
                }
            }
            app.end();
        }
        catch(Throwable err)
        {
            err.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment