Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Created June 1, 2015 13:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lawlesst/b64acd55a26a130cebd9 to your computer and use it in GitHub Desktop.
Save lawlesst/b64acd55a26a130cebd9 to your computer and use it in GitHub Desktop.
Read triples dumped from Jena SDB via SQL and convert to n-triples
"""
Used chardet library to find encoding - it reported:
{'confidence': 0.7158930206047512, 'encoding': 'ISO-8859-2'}
Use line writing to print ntriples as they are read to avoid
memory issues with storing all statements in an in memory
RDFLib Graph.
"""
import csv
import sys
from rdflib import Graph
from rdflib import Literal, URIRef
def clean_uri(raw):
    """Return ``raw`` without its leading ``<`` and trailing ``>`` characters.

    Every leading ``<`` and every trailing ``>`` is removed, not just the
    first one; angle brackets elsewhere in the string are left alone.
    """
    without_opening = raw.lstrip('<')
    without_closing = without_opening.rstrip('>')
    return without_closing
def print_triple(s, p, o):
    """Print a single triple to stdout in N-Triples format.

    A throwaway one-statement Graph is built and serialized on every call,
    so the caller never needs to hold more than one triple in memory.

    Args:
        s: Subject term, added to the graph as given.
        p: Predicate term, added to the graph as given.
        o: Object term, added to the graph as given.
    """
    g = Graph()
    g.add((s, p, o))
    serialized = g.serialize(format='nt')
    # Depending on the rdflib version, serialize() hands back bytes or text;
    # decode only when the result is not already the native str type so the
    # output is never printed as a b'...' repr.
    if not isinstance(serialized, str):
        serialized = serialized.decode('utf-8')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(serialized.strip('\n'))
def unicode_csv_reader(utf8_data, dialect=csv.excel, encoding='iso-8859-1',
                       **kwargs):
    """Yield CSV rows from ``utf8_data`` with byte cells transcoded to UTF-8.

    Adapted from http://stackoverflow.com/a/904085/758157 and
    http://stackoverflow.com/a/6539919/758157.

    Args:
        utf8_data: Iterable of lines (for example an open file) handed to
            ``csv.reader``.
        dialect: CSV dialect passed through to ``csv.reader``.
        encoding: Encoding that byte-string cells are decoded from before
            being re-encoded as UTF-8. Ignored for cells that are already
            text.
        **kwargs: Extra formatting options forwarded to ``csv.reader``
            (for example ``delimiter``).

    Yields:
        list: The cells of one row. Byte-string cells are decoded with
        ``encoding`` and re-encoded as UTF-8; cells that are already text
        (which is what ``csv.reader`` produces on Python 3) are passed
        through unchanged.
    """
    csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs)
    for row in csv_reader:
        # Only byte strings have a meaningful decode step; calling
        # .decode() on Python 3 text would raise AttributeError.
        yield [
            cell.decode(encoding).encode('utf8')
            if isinstance(cell, bytes) else cell
            for cell in row
        ]
# Stream the tab-separated dump named by the first command-line argument and
# print each row as one N-Triples statement.
with open(sys.argv[1]) as infile:
    reader = unicode_csv_reader(infile, delimiter="\t")
    # Skip the header row. The None default keeps an empty input file from
    # raising StopIteration, and next() works on Python 2.6+ and Python 3.
    next(reader, None)
    for row in reader:
        # Each data row must have exactly three columns.
        s, p, o = row
        subj = URIRef(clean_uri(s))
        pred = URIRef(clean_uri(p))
        # Every object value is emitted as a plain literal; any error from
        # Literal() propagates to the caller unchanged.
        obj = Literal(o)
        print_triple(subj, pred, obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment