# lawlesst/read_dumped_triples.py

## read_dumped_triples.py
"""

The chardet library was used to find the encoding - it reported:
{'confidence': 0.7158930206047512, 'encoding': 'ISO-8859-2'}
Note: the code below decodes cells as ISO-8859-1, not the ISO-8859-2
reported above.

Each triple is printed as an N-Triples line as soon as it is read, to
avoid the memory issues of storing all statements in a single
in-memory RDFLib Graph.

"""


import csv
import sys

from rdflib import Graph
from rdflib import Literal, URIRef

def clean_uri(raw):
    """
    Return raw without its leading '<' and trailing '>' characters.

    Every consecutive '<' at the start and every consecutive '>' at the
    end is removed, not just one of each; brackets anywhere else in the
    string are left alone.
    """
    without_opening = raw.lstrip('<')
    return without_opening.rstrip('>')

def print_triple(s, p, o):
    """
    Print the triple (s, p, o) to stdout in N-Triples format.

    s, p, o -- RDFLib terms for the subject, predicate and object.

    A fresh Graph holding only this one statement is serialized on every
    call, so no more than a single triple is kept in memory at a time.
    """
    g = Graph()
    g.add((s, p, o))
    serialized = g.serialize(format='nt')
    # On Python 3, older RDFLib releases return bytes here while newer ones
    # return text; normalise so strip('\n') and print behave the same.
    # On Python 2 bytes is str, so this branch never runs.
    if isinstance(serialized, bytes) and not isinstance(serialized, str):
        serialized = serialized.decode('utf-8')
    # Call form of print: identical output on Python 2 for a single
    # argument, and valid syntax on Python 3.
    print(serialized.strip('\n'))

def unicode_csv_reader(utf8_data, dialect=csv.excel, **kwargs):
    """
    Yield the rows of a CSV source, transcoding byte-string cells to UTF-8.

    Adapted from http://stackoverflow.com/a/904085/758157

    utf8_data -- iterable of lines, passed straight to csv.reader.
        Despite the name, byte input is decoded as ISO-8859-1.
    dialect, **kwargs -- forwarded to csv.reader unchanged.

    Each yielded row is a list of cells. Byte-string cells (what
    Python 2's csv module produces) are decoded as ISO-8859-1 and
    re-encoded as UTF-8, see http://stackoverflow.com/a/6539919/758157.
    Cells that are already text (what Python 3's csv module produces)
    have no decode() method and are yielded unchanged.
    """
    for row in csv.reader(utf8_data, dialect=dialect, **kwargs):
        yield [
            cell.decode('iso-8859-1').encode('utf8')
            if isinstance(cell, bytes) else cell
            for cell in row
        ]

# Script entry: stream the tab-delimited dump named by the first
# command-line argument and print one N-Triples line per data row.
#
# Python 2's csv module needs raw bytes (unicode_csv_reader transcodes the
# cells), whereas Python 3's needs text, so there the file is decoded on
# open with the same ISO-8859-1 codec unicode_csv_reader assumes.
if sys.version_info[0] >= 3:
    open_kwargs = {'encoding': 'iso-8859-1', 'newline': ''}
else:
    open_kwargs = {}

with open(sys.argv[1], **open_kwargs) as infile:
    reader = unicode_csv_reader(infile, delimiter="\t")
    # Skip the header; the default stops an empty file raising StopIteration.
    next(reader, None)
    for row in reader:
        # Each data row must hold exactly subject, predicate and object;
        # any other column count raises ValueError here.
        s, p, o = row
        subj = URIRef(clean_uri(s))
        pred = URIRef(clean_uri(p))
        # The object column is always wrapped in a Literal, never a URIRef.
        obj = Literal(o)
        print_triple(subj, pred, obj)