Skip to content

Instantly share code, notes, and snippets.

@tony
Created May 13, 2017 00:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tony/0218d6dfe51e37fc02f697a9405382a8 to your computer and use it in GitHub Desktop.
Save tony/0218d6dfe51e37fc02f697a9405382a8 to your computer and use it in GitHub Desktop.
create datapackage for unihan-tabular
#!/usr/bin/env python
# -*- coding: utf8 -*-
"""
Create a Tabular Data Package descriptor (datapackage.json) for unihan-tabular.

Based on http://frictionlessdata.io/guides/creating-tabular-data-packages-in-python/
For use with https://github.com/cihai/unihan-tabular
License: MIT
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals, with_statement)
import io
import csv
import os
from jsontableschema import infer
import datapackage
# Load the package metadata by executing unihan_tabular/__about__.py into a
# plain dict. This, and the name/title descriptors below, can be replaced
# with your own package name/title.
about = {}
about_file = os.path.join(
    os.path.dirname(__file__), 'unihan_tabular', '__about__.py')
with open(about_file) as about_fp:
    about_source = about_fp.read()
exec(about_source, about)

# Both the descriptor's name and its title come from the package's __title__.
dp = datapackage.DataPackage()
for descriptor_key in ('name', 'title'):
    dp.descriptor[descriptor_key] = about['__title__']
filepath = './data/unihan.csv'
# NOTE: On Python 2 this crashes on a Unicode-rich CSV because of its poor
# Unicode support. On Python 3, inference is slow; the original author
# reports having to chop the file down before running it.
#
# The file is decoded explicitly as UTF-8 rather than with the locale default
# (assumes the CSV is UTF-8 encoded -- TODO confirm), and opened with
# newline='' as the csv module requires so that embedded newlines and CRLF
# line endings are handled by the reader.
with io.open(filepath, encoding='utf-8', newline='') as stream:
    values = csv.reader(stream)
    # Parse the header row with the csv reader itself so quoted column names
    # and '\r\n' line endings are handled the same way as the data rows.
    try:
        headers = next(values)
    except StopIteration:
        raise ValueError('%s is empty; cannot infer a schema' % filepath)
    # The reader is lazy, so inference must run while the file is still open.
    schema = infer(headers, values)

# Register the CSV as the package's single resource, with the inferred schema.
dp.descriptor['resources'] = [
    {
        'name': 'data',
        'path': filepath,
        'schema': schema,
    },
]
# datapackage version 0.8.8 would wrongly attribute a date-like type to many
# fields, so the author had to find-replace those types with "string".
#
# Serialize before opening the output file: opening in 'w' mode truncates it,
# so a failure inside to_json() would otherwise leave an empty
# datapackage.json behind.
descriptor_json = dp.to_json()
with open('datapackage.json', 'w') as f:
    f.write(descriptor_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment