/** * Author: Pierre Lindenbaum PhD * plindenbaum@yahoo.fr * Date: 2012-11 * Motivation: RDFGraph from openoffice calc files * */ package oocalc; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.InputStream; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; import com.hp.hpl.jena.assembler.assemblers.AssemblerBase; import com.hp.hpl.jena.assembler.Assembler; import com.hp.hpl.jena.sparql.core.assembler.AssemblerUtils; import com.hp.hpl.jena.assembler.Mode; import com.hp.hpl.jena.datatypes.RDFDatatype; import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.graph.Node; import com.hp.hpl.jena.graph.Triple; import com.hp.hpl.jena.graph.TripleMatch; import com.hp.hpl.jena.graph.TripleMatchIterator; import com.hp.hpl.jena.graph.impl.GraphBase; import com.hp.hpl.jena.rdf.model.AnonId; import com.hp.hpl.jena.rdf.model.ResourceFactory; import com.hp.hpl.jena.rdf.model.impl.ModelCom; import com.hp.hpl.jena.util.iterator.ExtendedIterator; import com.hp.hpl.jena.util.iterator.NiceIterator; import com.hp.hpl.jena.sparql.core.DatasetImpl; import com.hp.hpl.jena.vocabulary.DC; import com.hp.hpl.jena.vocabulary.RDF; import com.hp.hpl.jena.vocabulary.XSD; import com.hp.hpl.jena.query.Dataset; import org.slf4j.LoggerFactory; import com.hp.hpl.jena.query.*; /** * implementation of a RDF Graph for OpenOffice calc * */ public class OpenOfficeCalcGraph extends GraphBase { /** logger */ protected static final org.slf4j.Logger LOG= LoggerFactory.getLogger("ooffice2rdf"); /** namespaces */ private static final String OFFICE="urn:oasis:names:tc:opendocument:xmlns:office:1.0"; private static final String TABLE="urn:oasis:names:tc:opendocument:xmlns:table:1.0"; private static final String TEXT="urn:oasis:names:tc:opendocument:xmlns:text:1.0"; private static final String NS="http://rdf.lindenb.org/"; /** attributes */ private static final QName number_columns_repeated=new QName(TABLE,"number-columns-repeated","table"); private static final QName number_rows_repeated=new QName(TABLE,"number-rows-repeated","table"); private static final QName value_type=new QName(OFFICE,"value-type","office"); private static final QName value=new QName(OFFICE,"value","office"); private static final QName name=new QName(TABLE,"name","table"); //rdf:type Node private static final Node rdfType=Node.createURI(RDF.type.getURI()); //all open office files private List<File> caclFiles=null; /** static Assembler for OpenOfficeCalcGraph * An assembler creates a Dataset(graph) from a RDF-based configuration file. * It is called by Fuseki */ public static OpenOfficeAssembler assembler = new OpenOfficeAssembler(); public static class OpenOfficeAssembler extends AssemblerBase implements Assembler { @Override public Object open( Assembler a, Resource root, Mode mode ) { //read the configuration an get the files List<File> files=new ArrayList<File>(); StmtIterator iter=root.listProperties(fileRsrc); while(iter.hasNext()) { Statement stmt=iter.nextStatement(); if(!stmt.getObject().isLiteral()) throw new RuntimeException("Not a literal "+stmt); String lit=stmt.getString(); File file=new File(lit); if(!file.exists()) throw new RuntimeException("File not found : "+file); if(!file.getName().endsWith(".ods")) throw new RuntimeException("Not an .ods file : "+file); files.add(file); } iter.close(); OpenOfficeCalcGraph g=new OpenOfficeCalcGraph(files); OpenOfficeCalcModel m=new OpenOfficeCalcModel(g); Dataset ds=new DatasetImpl(m); return ds; } } /** Initializer for FUZEKI */ private static boolean init_called = false ; private static final Resource buildRsrc=ResourceFactory.createResource(NS+"build"); private static final Property fileRsrc=ResourceFactory.createProperty(NS+"file"); /** static initializer, when this class is invoked, * it tells Fuzeki that there is another assembler using Assembler.general * the resource-name for this assembler is this.buildRsrc */ static { init() ; } private static void init() { if(init_called) return; LOG.info("Calling OpenOfficeCalcGraph init"); AssemblerUtils.init(); Assembler.general.implementWith(buildRsrc,assembler); init_called=true; } /** RDF Model for OpenOfficeCalcGraph */ public static class OpenOfficeCalcModel extends ModelCom { public OpenOfficeCalcModel(OpenOfficeCalcGraph g) { super(g); } } /* one row in the spredsheet */ private static class Row { int repeat=1; private List<Cell> cells=new ArrayList<Cell>(); } /* one cell in the spredsheet */ private static class Cell { int repeat=1; String type=null; String value=null; String literal=null; } /** Constructor from an array of OO files */ public OpenOfficeCalcGraph(List<File> calcFiles) { this.caclFiles=new ArrayList<File>(calcFiles); this.getPrefixMapping().setNsPrefix("office", NS); this.getPrefixMapping().setNsPrefix("xsd", XSD.getURI()); this.getPrefixMapping().setNsPrefix("dc", DC.getURI()); } @Override protected ExtendedIterator<Triple> graphBaseFind(TripleMatch matcher) { return new TripleMatchIterator((Triple)matcher, new CellIterator()); } /** parse the openoffice files and get the Triples */ private class CellIterator extends NiceIterator<Triple> { /** current index in array of OO files */ private int fileIndex=-1; /** buffer of triples */ private List<Triple> buffer=new LinkedList<Triple>(); /** next triple to be returned */ private Triple next=null; /** was hasNext() called ? */ private boolean hasNextCalled=false; /** current OO file opened */ private File ioFile=null; /** Zip Handler for OO file */ private ZipFile zipFile=null; /** Input Stream for current Zip entry */ private InputStream zipInputStream; /** xml-handler for current zip entry */ private XMLEventReader xmlEventReader; /* rdf subject for file */ private Node fileRsrc=null; /* rdf subject for tab */ private Node tabRsrc=null; /** current tab index */ private int tabIndex=0; /* current colun */ private int X=0; /** current row */ private int Y=0; private void add(Node s,Node p,Node o) { this.buffer.add(Triple.create(s, p, o)); } public CellIterator() { } private boolean isA(XMLEvent evt,String ns,String localName) { QName q=null; if(evt.isStartElement()) { q=evt.asStartElement().getName(); } else if(evt.isEndElement()) { q=evt.asEndElement().getName(); } return q!=null && q.getNamespaceURI().equals(ns) && q.getLocalPart().equals(localName) ; } @Override public boolean hasNext() { if(!hasNextCalled) { hasNextCalled=true; next=null; for(;;) { if(!buffer.isEmpty()) { next=buffer.remove(0); break; } try { if(xmlEventReader==null) { //open next file if(fileIndex+1>=OpenOfficeCalcGraph.this.caclFiles.size()) break; this.fileIndex++; this.tabIndex=0; //open XML StaX reader for current OO file XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE); xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); try { this.ioFile=OpenOfficeCalcGraph.this.caclFiles.get(this.fileIndex); this.zipFile=new ZipFile(this.ioFile); ZipEntry zipEntry=zipFile.getEntry("content.xml"); if(zipEntry==null) throw new RuntimeException("Cannot get content.xml"); this.zipInputStream=this.zipFile.getInputStream(zipEntry); xmlEventReader= xmlInputFactory.createXMLEventReader(this.zipInputStream); //describe the file as RDF this.fileRsrc=Node.createURI(this.ioFile.toURI().toASCIIString()); add(this.fileRsrc,rdfType,Node.createURI(NS+"Spreadsheet")); add(this.fileRsrc,Node.createURI(DC.title.getURI()),Node.createLiteral(this.ioFile.getName())); continue; } catch (Exception e) { throw new RuntimeException(e); } } if(xmlEventReader.hasNext()) { Attribute att=null; XMLEvent evt=xmlEventReader.nextEvent(); if(evt.isStartElement()) { StartElement E=evt.asStartElement(); if(isA(E,TABLE,"table")) { att=E.getAttributeByName(name); this.tabIndex++; //describe the tab as RDF this.tabRsrc=Node.createURI(this.ioFile.toURI().toASCIIString()+"/t"+tabIndex); add(this.tabRsrc,Node.createURI(NS+"file"),this.fileRsrc); add(this.tabRsrc,rdfType,Node.createURI(NS+"Table")); add(this.tabRsrc,Node.createURI(DC.title.getURI()),Node.createLiteral(att.getValue())); this.X=0; this.Y=0; } else if(isA(E,TABLE,"table-row")) { //parse the row Row row=parseRow(E); //create the statements for that row for(int i=0;i< row.repeat;++i) { this.X=0; this.Y++; for(Cell cell:row.cells) { for(int j=0;j< cell.repeat;++j) { this.X++; if(cell.value==null && cell.literal==null) continue; Node subject=Node.createURI(this.ioFile.toURI().toASCIIString()+"/t"+tabIndex+"/y"+Y+"/x"+X); add(subject,Node.createURI(NS+"table"),this.tabRsrc); add(subject,rdfType,Node.createURI(NS+"Cell")); add(subject,Node.createURI(NS+"X"),Node.createLiteral(String.valueOf(X),null,XSDDatatype.XSDint)); add(subject,Node.createURI(NS+"Y"),Node.createLiteral(String.valueOf(Y),null,XSDDatatype.XSDint)); Node cellValue=null; if(cell.type!=null && cell.value!=null) { XSDDatatype dataType=XSDDatatype.XSDstring; if(cell.type.equals("float")) { dataType=XSDDatatype.XSDfloat; } else if(cell.type.equals("int")) { dataType=XSDDatatype.XSDint; } cellValue=Node.createLiteral(cell.value, null, dataType); } else { cellValue=Node.createLiteral(String.valueOf(cell.literal)); } add( subject, Node.createURI(NS+"value"), cellValue ); } } } } } else if(evt.isEndElement()) { if(isA(evt,TABLE,"table")) { this.tabRsrc=null; } } } else //we're done for that file. { this.xmlEventReader.close(); this.zipInputStream.close(); this.zipFile.close(); this.xmlEventReader=null; this.zipInputStream=null; this.zipFile=null; this.fileRsrc=null; this.ioFile=null; } } catch(Exception err) { throw new RuntimeException(err); } } } return next!=null; } @Override public void close() { try { if(this.xmlEventReader!=null) this.xmlEventReader.close(); } catch (Exception e) {} this.xmlEventReader=null; try { if(this.zipInputStream!=null) this.zipInputStream.close(); } catch (Exception e) {} this.zipInputStream=null; try { if(this.zipFile!=null) this.zipFile.close(); } catch (Exception e) {} this.zipFile=null; this.buffer.clear(); this.fileIndex=caclFiles.size(); } @Override public Triple next() { if(!hasNextCalled) hasNext(); if(!hasNext()) throw new IllegalStateException(); Triple t=next; next=null; hasNextCalled=false; return t; } /** parses a table:table-row */ private Row parseRow(StartElement root) throws XMLStreamException { Row row=new Row(); Attribute att=root.getAttributeByName(number_rows_repeated); if(att!=null) { row.repeat=Integer.parseInt(att.getValue()); } while(this.xmlEventReader.hasNext()) { XMLEvent evt=this.xmlEventReader.nextEvent(); if(evt.isStartElement()) { StartElement E=evt.asStartElement(); if(isA(E,TABLE,"table-cell")) { row.cells.add(parseCell(E)); } } else if(evt.isEndElement()) { if(isA(evt,TABLE,"table-row")) { break; } } } return row; } /** parses a table:table-cell */ private Cell parseCell(StartElement root) throws XMLStreamException { Cell cell=new Cell(); Attribute att=root.getAttributeByName(number_columns_repeated); if(att!=null) { cell.repeat=Integer.parseInt(att.getValue()); } att=root.getAttributeByName(value_type); if(att!=null) { cell.type=att.getValue(); } att=root.getAttributeByName(value); if(att!=null) { cell.value=att.getValue(); cell.literal=cell.value; } while(this.xmlEventReader.hasNext()) { XMLEvent evt=this.xmlEventReader.nextEvent(); if(evt.isStartElement()) { StartElement E=evt.asStartElement(); if(isA(E,TEXT,"p")) { cell.literal=parseText(E); } } else if(evt.isEndElement()) { if(isA(evt,TABLE,"table-cell")) { break; } } } return cell; } /** returns the content of <text:p/> */ private String parseText(StartElement root) throws XMLStreamException { StringBuilder b=new StringBuilder(); while(xmlEventReader.hasNext()) { XMLEvent evt=this.xmlEventReader.nextEvent(); if(evt.isStartElement()) { throw new IllegalStateException(); } else if(evt.isEndElement()) { if(isA(evt,TEXT,"p")) { return b.toString(); } } else if(evt.isCharacters()) { b.append(evt.asCharacters().getData()); } } throw new IllegalStateException(); } } public static void main(String[] args) throws Exception { if(args.length<2) { System.err.println("Usage: query.sparql file1.ods, file2.ods... filen.ods"); return; } List<File> files=new ArrayList<File>(); for(int optind=1;optind< args.length;++optind) { files.add(new File(args[optind])); } OpenOfficeCalcGraph g=new OpenOfficeCalcGraph(files); OpenOfficeCalcModel m=new OpenOfficeCalcModel(g); com.hp.hpl.jena.query.Query query = QueryFactory.read(args[0]) ; LOG.info("starting query"); QueryExecution qexec = QueryExecutionFactory.create(query, m) ; try { ResultSet results = qexec.execSelect(); ResultSetFormatter.out(System.out,results,g.getPrefixMapping()); } finally { qexec.close() ; } } }