Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Read triples dumped from Jena SDB via SQL and convert to n-triples
"""
The chardet library was used to detect the input encoding; it reported:
{'confidence': 0.7158930206047512, 'encoding': 'ISO-8859-2'}
(Note that the CSV reader below nevertheless decodes cells as ISO-8859-1.)
N-Triples are written line by line as the rows are read, to avoid the
memory issues of storing all statements in an in-memory RDFLib Graph.
"""
import csv
import sys
from rdflib import Graph
from rdflib import Literal, URIRef
def clean_uri(raw):
    """Return *raw* without its leading '<' and trailing '>' characters.

    Every consecutive '<' at the start and every consecutive '>' at the
    end is removed; brackets elsewhere in the string are left alone.
    """
    without_opening = raw.lstrip('<')
    return without_opening.rstrip('>')
def print_triple(s, p, o):
    """Print the triple (s, p, o) to stdout using rdflib's 'nt' format.

    A fresh single-statement Graph is built for every call, so nothing
    accumulates in memory between triples.

    s, p and o are the subject, predicate and object terms; they are
    passed unchanged to Graph.add().
    """
    g = Graph()
    g.add((s, p, o))
    serialized = g.serialize(format='nt')
    # Depending on the rdflib/Python version, serialize() may hand back
    # bytes instead of text; on Python 3 that must be decoded before it can
    # be stripped with a text argument. On Python 2 `str` is the byte
    # string, so this branch is skipped and the output is unchanged.
    if not isinstance(serialized, str):
        serialized = serialized.decode('utf-8')
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(serialized.strip('\n'))
def unicode_csv_reader(utf8_data, dialect=csv.excel, encoding='iso-8859-1',
                       **kwargs):
    """Yield the rows of a CSV source with byte cells transcoded to UTF-8.

    Based on http://stackoverflow.com/a/904085/758157 and
    http://stackoverflow.com/a/6539919/758157

    utf8_data -- iterable of lines handed to csv.reader (despite the
                 name, byte input is interpreted using *encoding*).
    dialect   -- csv dialect, forwarded to csv.reader.
    encoding  -- codec used to decode byte cells before they are
                 re-encoded as UTF-8; defaults to 'iso-8859-1'.
    kwargs    -- extra formatting options forwarded to csv.reader
                 (e.g. delimiter).

    Each yielded row is a list of cells. Byte-string cells are returned
    as UTF-8 encoded byte strings; cells that are already text are
    yielded unchanged.
    """
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        # Only byte strings need transcoding. When the csv module yields
        # text cells (Python 3) they have no .decode() and are passed through.
        yield [
            cell.decode(encoding).encode('utf8')
            if isinstance(cell, bytes) else cell
            for cell in row
        ]
# Read the tab-separated dump named by the first command-line argument and
# print one N-Triples statement per data row.
with open(sys.argv[1]) as infile:
    reader = unicode_csv_reader(infile, delimiter="\t")
    # Skip the header row. The None default keeps an empty input file from
    # raising StopIteration, and next() works on both Python 2.6+ and 3.
    next(reader, None)
    # Each row must have exactly three columns (subject, predicate, object);
    # any other width raises ValueError on unpacking.
    for s, p, o in reader:
        subj = URIRef(clean_uri(s))
        pred = URIRef(clean_uri(p))
        # NOTE(review): every object is emitted as a plain Literal, even if
        # the dump contains URI objects -- confirm against the SQL export.
        obj = Literal(o)
        print_triple(subj, pred, obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment