Skip to content

Instantly share code, notes, and snippets.

@pudo
Created April 21, 2015 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pudo/c9d663fd8893ad453e82 to your computer and use it in GitHub Desktop.
Save pudo/c9d663fd8893ad453e82 to your computer and use it in GitHub Desktop.
Data exporter for OpenNames.org
import os
import json
import dataset
import unicodecsv
from datetime import datetime
# Connect to the local OpenNames PostgreSQL database and build an in-memory
# index of every row of the ``account`` table, keyed by the row's ``id``, so
# that dataset owners can be resolved without issuing further queries.
engine = dataset.connect('postgresql://localhost/opennames.org')
account_tbl = engine['account']
accounts = dict((row.get('id'), row) for row in account_tbl)
def hook(obj):
    """Serialise values the ``json`` module cannot encode on its own.

    Intended to be passed as the ``default=`` argument of ``json.dump`` /
    ``json.dumps``.

    :param obj: the object the JSON encoder could not serialise.
    :returns: the ISO 8601 string for ``datetime`` instances.
    :raises TypeError: for every other type. Handing the object back
        unchanged makes the encoder call this hook on the same object again
        and fail with a misleading "Circular reference detected" error, so
        the unsupported type is reported explicitly instead.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError('Object of type %s is not JSON serializable'
                    % type(obj).__name__)
def _flatten(record, prefix):
    """Return a CSV-ready copy of ``record`` with ``<prefix>_`` on every key.

    ``datetime`` values become ISO 8601 strings and ``dict`` values become
    JSON text. The ``canonical_id`` key is left out: the link it expresses is
    represented in the CSV by the ``canonical_*`` columns instead. ``record``
    itself is not modified.
    """
    flat = {}
    for key, value in record.items():
        if key == 'canonical_id':
            continue
        if isinstance(value, datetime):
            value = value.isoformat()
        elif isinstance(value, dict):
            value = json.dumps(value, default=hook)
        flat['%s_%s' % (prefix, key)] = value
    return flat


# Export every dataset into <root>/<owner login>/<dataset name>/ as three
# files: meta.json (the dataset row), data.json (the dataset row plus all of
# its entities keyed by id) and <dataset name>.csv (one flattened row per
# entity, with the canonical entity's fields alongside where one is set).
# The loop variable is deliberately not called ``dataset`` so that it does
# not shadow the imported ``dataset`` module.
for ds in engine['dataset']:
    print(ds.get('name'))
    # NOTE(review): if owner_id matches no account, acct is None and the
    # next statement raises AttributeError - confirm whether that can occur.
    acct = accounts.get(ds.get('owner_id'))
    path_prefix = os.path.join('/var/www/data.pudo.org/opennames.org',
                               acct.get('login'), ds.get('name'))
    try:
        os.makedirs(path_prefix)
    except OSError:
        # An already existing directory is fine; anything else (permissions,
        # a regular file in the way, ...) must not be silently ignored.
        if not os.path.isdir(path_prefix):
            raise

    with open(os.path.join(path_prefix, 'meta.json'), 'wb') as fh:
        json.dump(ds, fh, default=hook)

    entities = engine['entity'].find(dataset_id=ds.get('id'))
    entities = {e['id']: e for e in entities}
    with open(os.path.join(path_prefix, 'data.json'), 'wb') as fh:
        data = dict(ds)
        data['entities'] = entities
        json.dump(data, fh, default=hook)

    table = []
    keys = set()
    for entity in entities.values():
        row = _flatten(entity, 'entity')
        canonical_id = entity.get('canonical_id')
        # Only entities of this dataset were loaded, so the canonical entity
        # may be absent from the lookup; in that case the canonical_* columns
        # are simply left empty for this row.
        canonical = None
        if canonical_id is not None:
            canonical = entities.get(canonical_id)
        if canonical is not None:
            row.update(_flatten(canonical, 'canonical'))
        table.append(row)
        keys.update(row.keys())

    csv_path = os.path.join(path_prefix, '%s.csv' % ds.get('name'))
    with open(csv_path, 'wb') as fh:
        # Sort the header so the column order is stable between runs.
        writer = unicodecsv.DictWriter(fh, fieldnames=sorted(keys))
        writer.writeheader()
        for row in table:
            writer.writerow(row)

# NOTE(review): assumed to run once after all datasets are exported; the
# original indentation of this statement is not recoverable - confirm.
print(len(accounts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment