Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
"""
Read triples dumped from Jena SDB via SQL and convert them to N-Triples.

The chardet library was used to detect the input encoding; it reported:
{'confidence': 0.7158930206047512, 'encoding': 'ISO-8859-2'}

N-Triples are printed line by line as they are read, to avoid the memory
cost of storing all statements in an in-memory RDFLib Graph.
"""
import csv
import sys
from rdflib import Graph
from rdflib import Literal, URIRef
def clean_uri(raw):
    """Return *raw* without its enclosing angle brackets.

    Every leading ``<`` and every trailing ``>`` is removed, so
    ``'<http://example.org/a>'`` becomes ``'http://example.org/a'``.
    A value without brackets is returned unchanged.
    """
    return raw.lstrip('<').rstrip('>')
def print_triple(s, p, o):
    """Print the single triple ``(s, p, o)`` to stdout as one N-Triples line.

    A throwaway one-statement Graph is built for each call so that rdflib's
    own ``nt`` serializer does the term escaping; nothing is accumulated
    between calls.

    Args:
        s: subject term, as accepted by ``Graph.add``.
        p: predicate term.
        o: object term.
    """
    g = Graph()
    g.add((s, p, o))
    nt = g.serialize(format='nt')
    # Depending on the rdflib/Python version serialize() hands back either a
    # native string or bytes; only the latter needs decoding before printing.
    if not isinstance(nt, str):
        nt = nt.decode('utf-8')
    # Single-argument call form is valid on both Python 2 and Python 3.
    print(nt.strip('\n'))
def unicode_csv_reader(utf8_data, dialect=csv.excel, encoding='iso-8859-1',
                       **kwargs):
    """Yield the rows of a CSV source with byte cells re-encoded as UTF-8.

    Args:
        utf8_data: iterable of lines handed straight to ``csv.reader``.
            Despite the name, byte cells are decoded using *encoding*,
            not UTF-8.
        dialect: csv dialect, forwarded to ``csv.reader``.
        encoding: source encoding used to decode byte-string cells. Defaults
            to ``'iso-8859-1'``, the value that was previously hard-coded.
        **kwargs: extra formatting parameters forwarded to ``csv.reader``
            (for example ``delimiter="\\t"``).

    Yields:
        One list of cells per row. Byte-string cells (what the Python 2 csv
        module produces) are decoded from *encoding* and re-encoded as
        UTF-8 bytes. Cells that are already text (what the Python 3 csv
        module produces) are passed through unchanged.
    """
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        # Only byte strings can be transcoded; calling .decode() on text
        # cells would raise AttributeError on Python 3.
        yield [
            cell.decode(encoding).encode('utf8')
            if isinstance(cell, bytes) else cell
            for cell in row
        ]
# Script entry point: read the tab-separated dump named by the first
# command-line argument and print one N-Triples line per data row.
#
# Python 2's csv module consumes byte strings while Python 3's consumes
# decoded text, so the input file has to be opened differently on each.
if str is bytes:
    infile = open(sys.argv[1], 'rb')
else:
    infile = open(sys.argv[1], encoding='iso-8859-1', newline='')

with infile:
    reader = unicode_csv_reader(infile, delimiter="\t")
    # skip header
    next(reader, None)
    for row in reader:
        try:
            s, p, o = row
            subj = URIRef(clean_uri(s))
            pred = URIRef(clean_uri(p))
            # The object column is always emitted as a plain literal.
            obj = Literal(o)
        except Exception as err:
            # Best effort: report the bad row (e.g. wrong column count) and
            # keep going rather than aborting the whole conversion.
            sys.stderr.write("skipping row %r: %s\n" % (row, err))
            continue
        print_triple(subj, pred, obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment