Created
June 1, 2015 13:03
-
-
Save lawlesst/b64acd55a26a130cebd9 to your computer and use it in GitHub Desktop.
Read triples dumped from Jena SDB via SQL and convert to n-triples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The chardet library was used to detect the input encoding; it reported:
{'confidence': 0.7158930206047512, 'encoding': 'ISO-8859-2'}
Statements are printed as N-Triples line by line as they are read, to
avoid the memory cost of storing every statement in an in-memory
RDFLib Graph.
""" | |
import csv | |
import sys | |
from rdflib import Graph | |
from rdflib import Literal, URIRef | |
def clean_uri(raw):
    """Return *raw* without its leading '<' and trailing '>' characters.

    Every consecutive '<' at the start and every consecutive '>' at the
    end is removed; characters elsewhere in the string are left alone.
    """
    without_open = raw.lstrip('<')
    return without_open.rstrip('>')
def print_triple(s, p, o):
    """Print the single statement (s, p, o) in N-Triples form.

    A throwaway one-statement Graph is built for each call so that rdflib
    performs the N-Triples serialization, and nothing accumulates in memory
    between calls.

    s, p, o -- rdflib terms for subject, predicate and object.
    """
    g = Graph()
    g.add((s, p, o))
    # Strip the newlines the serializer puts around the statement; print
    # adds its own. The parenthesised single-argument form behaves the same
    # as the print statement on Python 2 and also parses on Python 3.
    print(g.serialize(format='nt').strip('\n'))
def unicode_csv_reader(utf8_data, dialect=csv.excel, encoding='iso-8859-1',
                       **kwargs):
    """Yield the rows of a CSV source with byte-string cells as UTF-8.

    Based on http://stackoverflow.com/a/904085/758157 and
    http://stackoverflow.com/a/6539919/758157.

    utf8_data -- iterable of lines, handed straight to csv.reader.
    dialect   -- csv dialect, handed straight to csv.reader.
    encoding  -- encoding the byte-string cells are decoded from before
                 being re-encoded as UTF-8. Defaults to 'iso-8859-1',
                 the value that was previously hard-coded.
    kwargs    -- extra csv.reader options, e.g. delimiter="\\t".

    Each yielded row is a list of cells. Cells that csv.reader returns as
    byte strings (the Python 2 case) are transcoded from *encoding* to
    UTF-8; cells that are already text are passed through unchanged, since
    they have no bytes left to decode.
    """
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        yield [
            cell.decode(encoding).encode('utf8')
            if isinstance(cell, bytes) else cell
            for cell in row
        ]
# Script body: stream the tab-separated dump named by the first command-line
# argument and print one N-Triples statement per data row.
with open(sys.argv[1]) as infile:
    reader = unicode_csv_reader(infile, delimiter="\t")
    # Skip the header row. The default argument keeps an empty input file
    # from raising StopIteration.
    next(reader, None)
    for row in reader:
        # csv yields an empty list for blank lines (e.g. a trailing
        # newline); there is nothing to convert, so skip them rather than
        # fail the three-way unpack below.
        if not row:
            continue
        # Any other row must have exactly three columns; a ValueError here
        # points at a malformed line in the dump.
        s, p, o = row
        subj = URIRef(clean_uri(s))
        pred = URIRef(clean_uri(p))
        # NOTE(review): the object column is always emitted as a plain
        # Literal, even if the value looks like <uri> -- confirm that this
        # matches what the dump contains.
        obj = Literal(o)
        print_triple(subj, pred, obj)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment