'''Load a test CSV into a CKAN DataStore table using pgloader.

Guesses the column types with messytables, (re)creates the DataStore
table, then shells out to pgloader to do the bulk load.
'''
import argparse
import subprocess
from pprint import pprint
import os.path

import sqlalchemy
import messytables

import ckanext.datastore.db as db
import ckanext.datastore.helpers as datastore_helpers

args = None

TYPES = [messytables.StringType, messytables.DecimalType,
         messytables.IntegerType, messytables.DateUtilType]

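# Map messytables type names (as produced by str() on a guessed type
# instance) to PostgreSQL column types.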
TYPE_MAPPING = {
    'String': 'text',
    # 'int' may not be big enough,
    # and type detection may not realize it needs to be big
    'Integer': 'numeric',
    'Decimal': 'numeric',
    'DateUtil': 'timestamp'
}

res_id = 'test_res'
postgres_url = \
    'postgresql://ckan_default:pass@localhost/datastore_default'
# normally taken from config['ckan.datastore.write_url']


def load(csv_filepath):
    f = open(csv_filepath, 'r')
    table_set = messytables.any_tableset(f, extension='csv')
    row_set = table_set.tables.pop()
    csv_dialect = row_set._dialect()
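    # Keep the CSV dialect messytables sniffed (delimiter, quote char,
    # line terminator) - pgloader needs it later to parse the file the
    # same way.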
    offset, headers = messytables.headers_guess(row_set.sample)
    # Some headers might have been converted from strings to floats and such.
    headers = [unicode(header) for header in headers]
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)
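    # (strict=True asks type_guess to prefer a type only when it holds for
    # all of the sampled values, rather than a looser majority guess.)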
    # Strip whitespace from the headers and drop blank ones, discarding the
    # corresponding guessed type too, so headers and types stay aligned.
    headers_and_types = [(header.strip(), type_)
                         for header, type_ in zip(headers, types)
                         if header.strip()]
    headers = [header for header, type_ in headers_and_types]
    headers_set = set(headers)
    # row_set.register_processor(messytables.types_processor(types))

    # def row_iterator():
    #     for row in row_set:
    #         data_row = {}
    #         for index, cell in enumerate(row):
    #             column_name = cell.column.strip()
    #             if column_name not in headers_set:
    #                 continue
    #             data_row[column_name] = cell.value
    #         yield data_row
    # result = row_iterator()

    headers_dicts = [dict(id=header, type=TYPE_MAPPING[str(type_)])
                     for header, type_ in headers_and_types]
    print 'headers_dicts:'
    pprint(headers_dicts)

    # Delete existing datastore resource before proceeding
    def get_all_resources_ids_in_datastore():
        data_dict = {'connection_url': postgres_url}
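        # _table_metadata is a view the DataStore sets up listing its
        # tables; rows with alias_of set are aliases, not real tables.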
        resources_sql = sqlalchemy.text(u'''SELECT name FROM "_table_metadata"
                                            WHERE alias_of IS NULL''')
        query = db._get_engine(data_dict).execute(resources_sql)
        return [q[0] for q in query.fetchall()]
    all_res_ids = get_all_resources_ids_in_datastore()
    context = {}
    if res_id in all_res_ids:
        print 'Dropping table: {}'.format(res_id)
        data_dict = {
            'connection_url': postgres_url,
            'resource_id': res_id,
        }
        db.delete(context, data_dict)

    # Create datastore resource
    print 'Creating table: {}'.format(res_id)
    data_dict = {
        'connection_url': postgres_url,
        'resource_id': res_id,
        'fields': headers_dicts,
    }
    if False:
        # result = db.create(context, data_dict)
        pass
    else:
        # avoid creating index for now
        engine = db._get_engine(data_dict)
        context['connection'] = engine.connect()
        result = db.create_table(context, data_dict)
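        # (db.create() would do extra work such as building indexes;
        # db.create_table() just creates the bare table)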

    # Maintain data dictionaries from matching column names
    existing = False  # columns are all correct as we just created them all
    if existing:
        existing_info = dict(
            (f['id'], f['info'])
            for f in existing.get('fields', []) if 'info' in f)
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]

    print('Header offset: {}'.format(offset))
    print('Headers: {}'.format(headers))
    print('Types: {}'.format(types))
    # print('Headers and types: {}'.format(headers_dicts))

    # cmd = [
    #     'pgloader',
    #     '--type', 'csv',
    #     '--field', "'{}'".format(','.join(headers)),
    #     '--with', 'skip header = {}'.format(offset + 1),  # --with "skip header = 1"
    #     # '--with', 'drop indexes',  # doesn't work due to numbers not quoted in drop
    #     csv_filepath,
    #     postgres_url + postgres_table_param,
    # ]
    # print ' '.join(cmd)
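
    # The CLI attempt above is kept for reference - a pgloader command file
    # (written below) can express options the command line handles badly.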
    skip_header_rows = offset + 1
    pgloader_command_file_buffer = get_csv_pgloader_command_file_buffer(
        postgres_url, res_id,
        csv_filepath, headers, skip_header_rows, csv_dialect)
    pgloader_options_filepath = '/tmp/pgloader_options'
    with open(pgloader_options_filepath, 'w') as f:
        f.write(pgloader_command_file_buffer)
    print pgloader_command_file_buffer
    cmd = ['pgloader', pgloader_options_filepath]
    print ' '.join(cmd)
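    # subprocess.call() returns pgloader's exit code (0 on success), which
    # is worth surfacing since a failed load would otherwise go unnoticed.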
    result = subprocess.call(cmd)
    print 'Result: ', result


def get_csv_pgloader_command_file_buffer(
        postgres_url, table_name,
        csv_filepath, header_names, skip_header_rows, csv_dialect):
    postgres_table_param = '?tablename={}'.format(table_name)
    # identifier() double-quotes the field names, making them case-sensitive
    # to match how the columns were created.
    fields = ','.join(datastore_helpers.identifier(field_name)
                      for field_name in header_names)
    options = '''
LOAD CSV
     FROM '{filepath}'
          HAVING FIELDS ({fields})
     INTO {postgres_url}
          TARGET COLUMNS ({target_fields})
     WITH skip header = {skip_header},
          batch rows = 10000,
          drop indexes,
          quote identifiers,
          fields terminated by '{delimiter}',
          fields optionally enclosed by '{quote_char}',
          lines terminated by '{line_terminator}'
;'''.format(
        filepath=os.path.abspath(csv_filepath),
        fields=fields,
        postgres_url=postgres_url + postgres_table_param,
        target_fields=fields,
        skip_header=skip_header_rows,
        delimiter=csv_dialect.delimiter,
        quote_char=csv_dialect.quotechar,
        line_terminator=csv_dialect.lineterminator,
    )
    return options
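

# For illustration, given a hypothetical comma-delimited, double-quoted CSV
# at /tmp/example.csv with headers "name" and "age", the function above
# would emit a command file roughly like this (line terminator shown
# escaped; the real buffer embeds the raw characters):
#
#   LOAD CSV
#        FROM '/tmp/example.csv'
#             HAVING FIELDS ("name","age")
#        INTO postgresql://ckan_default:pass@localhost/datastore_default?tablename=test_res
#             TARGET COLUMNS ("name","age")
#        WITH skip header = 1,
#             batch rows = 10000,
#             drop indexes,
#             quote identifiers,
#             fields terminated by ',',
#             fields optionally enclosed by '"',
#             lines terminated by '\r\n'
#   ;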


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('csv_filepath')
    args = parser.parse_args()
    load(args.csv_filepath)
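# Example invocation (script name and CSV path are hypothetical):
#   python load_test_csv.py /tmp/example.csv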