Skip to content

Instantly share code, notes, and snippets.

@unaipme
Created May 9, 2019 14:12
Show Gist options
  • Save unaipme/853c8c0b55635c93774a837aa139027b to your computer and use it in GitHub Desktop.
Save unaipme/853c8c0b55635c93774a837aa139027b to your computer and use it in GitHub Desktop.
(Custom) CSV to XML
package eus.unai.jena;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.jena.rdf.model.*;
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.RDFS;
/**
 * Builds an RDF ABox from three semicolon-separated DBLP CSV exports (articles, proceedings and
 * in-proceedings papers), adds it on top of an existing TBox and writes the result as RDF/XML.
 *
 * <p>Besides the facts found in the CSV files, random "edits", "reviews" and "speaksAt" links are
 * generated between the imported authors and the imported volumes, journal articles and
 * conferences.
 */
public class Main {

    private static final String DATA_URI_TEMPLATE = "http://localhost:8890/articles#%s";
    // NOTE(review): identical to DATA_URI_TEMPLATE, so instances and schema terms share one
    // namespace. Kept as-is because changing it would change every generated URI.
    private static final String SCH_URI_TEMPLATE = "http://localhost:8890/articles#%s";

    private static final String TBOX_FILE = "/home/unai/final-tbox.owl";
    private static final String ARTICLES_CSV = "dblp_article.csv";
    private static final String PROCEEDINGS_CSV = "dblp_proceedings.csv";
    private static final String INPROCEEDINGS_CSV = "dblp_inproceedings.csv";
    private static final String OUTPUT_FILE = "abox.xml";

    /** Maximum number of data rows (header excluded) read from the articles CSV. */
    private static final long ARTICLE_ROW_LIMIT = 5000;
    /** Maximum number of data rows (header excluded) read from the in-proceedings CSV. */
    private static final long INPROCEEDINGS_ROW_LIMIT = 1000;

    /** Returns the instance (data) URI for the given local name. */
    private static String dataUri(String localName) {
        return String.format(DATA_URI_TEMPLATE, localName);
    }

    /** Returns the schema URI for the given class or property local name. */
    private static String schUri(String localName) {
        return String.format(SCH_URI_TEMPLATE, localName);
    }

    /**
     * Replaces lowercase accented vowels (acute, diaeresis, grave, circumflex) with their plain
     * ASCII vowel. Despite its name this does not convert anything "to Unicode"; every other
     * character, including uppercase accented letters, is left untouched.
     *
     * @param str text to transliterate
     * @return {@code str} with the listed accented vowels replaced
     */
    private static String toUnicode(String str) {
        // Literal (non-regex) replacement; none of these patterns needs regex semantics.
        return str.replace("á", "a")
                .replace("ä", "a")
                .replace("à", "a")
                .replace("â", "a")
                .replace("é", "e")
                .replace("ë", "e")
                .replace("è", "e")
                .replace("ê", "e")
                .replace("í", "i")
                .replace("ï", "i")
                .replace("ì", "i")
                .replace("î", "i")
                .replace("ó", "o")
                .replace("ö", "o")
                .replace("ò", "o")
                .replace("ô", "o")
                .replace("ú", "u")
                .replace("ü", "u")
                .replace("ù", "u")
                .replace("û", "u");
    }

    /**
     * Cleans a free-text name so it can be used as the fragment of a generated URI: punctuation
     * ({@code . ( ) , " ' [ ] :}) is dropped, {@code %} becomes "percent", {@code #} becomes
     * "pound", and {@code /} and spaces become {@code -}.
     *
     * @param str raw name taken from a CSV column
     * @return the cleaned name
     */
    private static String cu(String str) {
        StringBuilder cleaned = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '.':
                case '(':
                case ')':
                case ',':
                case '"':
                case '\'':
                case '[':
                case ']':
                case ':':
                    // dropped
                    break;
                case '%':
                    cleaned.append("percent");
                    break;
                case '#':
                    cleaned.append("pound");
                    break;
                case '/':
                case ' ':
                    cleaned.append('-');
                    break;
                default:
                    cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Reads the TBox, imports the three CSV files, adds the random role links and writes the
     * resulting model to {@value #OUTPUT_FILE}.
     *
     * @param args ignored
     * @throws Exception if the TBox or a CSV file cannot be read, or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        Model model = ModelFactory.createDefaultModel();
        // try-with-resources: the input stream used to be left open.
        try (InputStream tbox = Files.newInputStream(Paths.get(TBOX_FILE))) {
            model.read(tbox, null);
        }

        Map<String, Resource> authorResources = new HashMap<>();
        Map<String, Resource> articleResources = new HashMap<>();
        Map<String, Journal> journalResources = new HashMap<>();
        Map<String, Resource> conferenceResources = new HashMap<>();
        Map<String, Resource> proceedingsResources = new HashMap<>();

        importArticles(model, authorResources, articleResources, journalResources);
        importProceedings(model, conferenceResources, proceedingsResources);
        importInProceedings(model, authorResources, proceedingsResources);
        assignRandomRoles(model, new Random(), authorResources, journalResources,
                articleResources, conferenceResources);

        // Write through an OutputStream (not a Writer) so the serializer controls the encoding,
        // and close it so the file is completely flushed to disk.
        try (OutputStream out = new BufferedOutputStream(Files.newOutputStream(Paths.get(OUTPUT_FILE)))) {
            model.write(out, "RDF/XML");
        }
    }

    /**
     * Feeds the data rows of a semicolon-separated CSV file to {@code rowHandler}.
     *
     * <p>The first line is treated as a header and skipped. A row whose handler throws a
     * {@link RuntimeException} (typically a missing column or an unparsable number) is skipped,
     * best-effort, and counted; a summary of skipped rows is printed to {@code System.err}.
     * Anything the handler added to the model before failing stays in the model.
     *
     * @param fileName   path of the CSV file, read as UTF-8
     * @param maxRows    maximum number of data rows to read
     * @param rowHandler receives the columns of each row
     * @throws IOException if the file cannot be opened or read
     */
    private static void forEachRow(String fileName, long maxRows, Consumer<String[]> rowHandler)
            throws IOException {
        int skipped = 0;
        RuntimeException firstFailure = null;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                Files.newInputStream(Paths.get(fileName)), StandardCharsets.UTF_8))) {
            reader.readLine(); // header
            String line;
            for (long row = 0; row < maxRows && (line = reader.readLine()) != null; row++) {
                try {
                    // split(";") drops trailing empty columns, so short rows are expected.
                    rowHandler.accept(line.split(";"));
                } catch (RuntimeException e) {
                    if (firstFailure == null) {
                        firstFailure = e;
                    }
                    skipped++;
                }
            }
        }
        if (skipped > 0) {
            System.err.printf("%s: %d row(s) could not be fully imported (first cause: %s)%n",
                    fileName, skipped, firstFailure);
        }
    }

    /**
     * Links every non-empty name of a {@code |}-separated author column to {@code articleResource}
     * through {@code propWrites}, creating and caching a Person resource on first sight.
     */
    private static void linkAuthors(Model model, Map<String, Resource> authorResources,
            String authorColumn, Property propWrites, Resource articleResource) {
        for (String author : authorColumn.split("\\|")) {
            if (author.isEmpty()) {
                continue; // an empty column would otherwise yield a Person with an empty id
            }
            String id = toUnicode(cu(author));
            Resource authorResource = authorResources.computeIfAbsent(id, key ->
                    model.createResource(dataUri(key))
                            .addProperty(RDFS.label, author)
                            .addProperty(RDF.type, model.getResource(schUri("Person"))));
            model.add(authorResource, propWrites, articleResource);
        }
    }

    /**
     * Imports journal articles: Paper, Person, Journal and Volume resources plus their links.
     * Only articles that end up linked to a volume are recorded in {@code articleResources}.
     */
    private static void importArticles(Model model, Map<String, Resource> authorResources,
            Map<String, Resource> articleResources, Map<String, Journal> journalResources)
            throws IOException {
        final Property propWrites = model.createProperty(schUri("writes"));
        final Property propPublishedIn = model.createProperty(schUri("publishedIn"));
        final Property propBelongsTo = model.createProperty(schUri("belongsTo"));

        forEachRow(ARTICLES_CSV, ARTICLE_ROW_LIMIT, columns -> {
            String articleName = columns[27];
            if (articleName.isEmpty()) {
                return;
            }
            Resource articleResource = model.createResource(dataUri(toUnicode(cu(articleName))))
                    .addLiteral(RDFS.label, articleName)
                    .addProperty(RDF.type, model.getResource(schUri("Paper")));
            linkAuthors(model, authorResources, columns[1], propWrites, articleResource);

            String journalName = columns[15];
            if (journalName.isEmpty()) {
                return; // article without a journal: kept in the model, but not published anywhere
            }
            // Parsed before any journal resource is created, so a bad volume creates nothing.
            // Kept boxed on purpose: an Integer selects addLiteral(Property, Object), whereas a
            // primitive int would select the long overload and change the literal's datatype.
            Integer volume = Integer.valueOf(columns[31]);

            Journal journal = journalResources.get(journalName);
            if (journal == null) {
                Resource journalResource = model.createResource(dataUri(cu(journalName)))
                        .addLiteral(RDFS.label, journalName)
                        .addProperty(RDF.type, model.getResource(schUri("Journal")));
                journal = new Journal(journalResource);
                // Store the same instance the volume is registered on below. Storing a second,
                // empty Journal here used to lose the first volume of every journal.
                journalResources.put(journalName, journal);
            }

            Resource volumeResource = journal.getVolume(volume);
            if (volumeResource == null) {
                volumeResource = model.createResource(dataUri(cu(journalName) + "-" + volume))
                        .addLiteral(RDFS.label, volume)
                        .addProperty(RDF.type, model.getResource(schUri("Volume")));
                model.add(volumeResource, propBelongsTo, journal.getJournal());
                journal.put(volume, volumeResource);
            }

            articleResources.put(articleName, articleResource);
            model.add(articleResource, propPublishedIn, volumeResource);
        });
    }

    /**
     * Imports conferences and their proceedings; proceedings are indexed by their key column so
     * in-proceedings papers can be attached to them later.
     */
    private static void importProceedings(Model model, Map<String, Resource> conferenceResources,
            Map<String, Resource> proceedingsResources) throws IOException {
        final Property propFrom = model.createProperty(schUri("from"));

        forEachRow(PROCEEDINGS_CSV, Long.MAX_VALUE, columns -> {
            String conferenceName = columns[3];
            String proceedingsTitle = columns[28];
            String proceedingsKey = columns[15];
            if (conferenceName.isEmpty() || proceedingsTitle.isEmpty()) {
                return;
            }
            Resource conferenceResource = conferenceResources.computeIfAbsent(conferenceName, name ->
                    model.createResource(dataUri(cu(name)))
                            .addProperty(RDF.type, model.getResource(schUri("Conference")))
                            .addLiteral(RDFS.label, name));

            // Disambiguate proceedings whose title would collide with a conference.
            // NOTE(review): proceedingsResources is keyed by proceedings key, yet it is queried
            // with a conference name here; confirm the intended check. Logic left unchanged.
            boolean clashesWithConference = cu(proceedingsTitle).equals(cu(conferenceName))
                    || proceedingsTitle.equals(conferenceName)
                    || proceedingsResources.containsKey(conferenceName)
                    || conferenceResources.containsKey(proceedingsTitle);
            if (clashesWithConference) {
                proceedingsTitle = proceedingsTitle.concat(" (Proceedings)");
            }

            Resource proceedingsResource = model.createResource(dataUri(cu(proceedingsTitle)))
                    .addProperty(RDF.type, model.getResource(schUri("Proceedings")))
                    .addLiteral(RDFS.label, proceedingsTitle);
            proceedingsResources.put(proceedingsKey, proceedingsResource);
            model.add(proceedingsResource, propFrom, conferenceResource);
        });
    }

    /**
     * Imports conference papers and links them to their authors and to already imported
     * proceedings; rows that reference unknown proceedings are ignored.
     */
    private static void importInProceedings(Model model, Map<String, Resource> authorResources,
            Map<String, Resource> proceedingsResources) throws IOException {
        final Property propWrites = model.createProperty(schUri("writes"));
        final Property propPublishedIn = model.createProperty(schUri("publishedIn"));

        forEachRow(INPROCEEDINGS_CSV, INPROCEEDINGS_ROW_LIMIT, columns -> {
            String proceedingsKey = columns[7];
            String articleName = columns[23];
            if (proceedingsKey.isEmpty() || articleName.isEmpty()) {
                return;
            }
            Resource proceedingsResource = proceedingsResources.get(proceedingsKey);
            if (proceedingsResource == null) {
                return;
            }
            Resource articleResource = model.createResource(dataUri(toUnicode(cu(articleName))))
                    .addLiteral(RDFS.label, articleName)
                    .addProperty(RDF.type, model.getResource(schUri("Paper")));
            linkAuthors(model, authorResources, columns[1], propWrites, articleResource);
            model.add(articleResource, propPublishedIn, proceedingsResource);
        });
    }

    /**
     * Adds randomly generated links: one editor per journal volume, and three or four reviewers
     * per recorded article and speakers per conference (the same author may be drawn twice, in
     * which case fewer distinct links result). Does nothing when no author was imported.
     */
    private static void assignRandomRoles(Model model, Random random,
            Map<String, Resource> authorResources, Map<String, Journal> journalResources,
            Map<String, Resource> articleResources, Map<String, Resource> conferenceResources) {
        // Snapshot the authors once instead of copying the whole map for every single pick.
        List<Resource> authorPool = new ArrayList<>(authorResources.values());
        if (authorPool.isEmpty()) {
            // random.nextInt(0) would throw; there is simply nobody to assign.
            System.err.println("No authors were imported; skipping editor/reviewer/speaker assignment.");
            return;
        }
        final Property propEdits = model.createProperty(schUri("edits"));
        final Property propReviews = model.createProperty(schUri("reviews"));
        final Property propSpeaksAt = model.createProperty(schUri("speaksAt"));

        journalResources.forEach((name, journal) -> journal.forEachResource(volume ->
                model.add(pickRandom(authorPool, random), propEdits, volume)));
        articleResources.forEach((name, article) ->
                addRandomLinks(model, random, authorPool, propReviews, article));
        conferenceResources.forEach((name, conference) ->
                addRandomLinks(model, random, authorPool, propSpeaksAt, conference));
    }

    /** Links three or four randomly drawn authors to {@code target} through {@code property}. */
    private static void addRandomLinks(Model model, Random random, List<Resource> authorPool,
            Property property, Resource target) {
        // Drawn once; re-rolling the bound on every iteration only wasted random numbers.
        int count = random.nextInt(2) + 3;
        for (int i = 0; i < count; i++) {
            model.add(pickRandom(authorPool, random), property, target);
        }
    }

    /** Returns a uniformly chosen element of the non-empty {@code pool}. */
    private static Resource pickRandom(List<Resource> pool, Random random) {
        return pool.get(random.nextInt(pool.size()));
    }

    /** A journal resource together with the volume resources created for it, keyed by volume number. */
    private static class Journal {
        private final Resource journalResource;
        private final Map<Integer, Resource> volumeResources = new HashMap<>();

        private Journal(Resource journalResource) {
            this.journalResource = journalResource;
        }

        /** Registers the resource of volume {@code key}. */
        private void put(Integer key, Resource volumeResource) {
            volumeResources.put(key, volumeResource);
        }

        /** Returns whether a resource is registered for volume {@code key}. */
        private boolean contains(Integer key) {
            return volumeResources.containsKey(key);
        }

        /** Returns the resource that represents the journal itself. */
        private Resource getJournal() {
            return this.journalResource;
        }

        /** Returns the resource of volume {@code key}, or {@code null} if none is registered. */
        private Resource getVolume(Integer key) {
            return volumeResources.get(key);
        }

        /** Passes every registered volume resource to {@code consumer}. */
        private void forEachResource(Consumer<Resource> consumer) {
            volumeResources.values().forEach(consumer);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment