Skip to content

Instantly share code, notes, and snippets.

@anisotropi4
Last active April 29, 2017 17:31
Show Gist options
  • Save anisotropi4/7c62a5961e22765800383fd0f2cc2fdd to your computer and use it in GitHub Desktop.
Save anisotropi4/7c62a5961e22765800383fd0f2cc2fdd to your computer and use it in GitHub Desktop.
A python3 script create a PostgreSQL import script based on column names in the header of a tsv file-format
#!/usr/bin/python3
import csv
import sys
import os
import re
filename = sys.argv[1]
tablename = filename
#[1/3] For csv file uncomment the next line and comment out tsv
#tablename = re.sub(r'\.csv$', '', tablename)
tablename = re.sub(r'\.tsv$', '', tablename)
tablename = re.sub(r'[ \.()\/\-]', '_', tablename)
tablename = re.sub(r'^', 'table_', tablename)
instances = {}
attributes = []
unique = True
s1_re = re.compile('[ ()\/\-\\%\]\[]+')
s2_re = re.compile('[\.:\?%]')
s3_re = re.compile('#')
s4_re = re.compile('£')
s5_re = re.compile('^_+')
s6_re = re.compile('_+$')
print(tablename)
with open(filename) as csvfile:
#[2/3] For csv file uncomment the next line and comment out tsv
#reader = csv.reader(csvfile, delimiter=',', quotechar='"')
reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
for fields in reader:
count = 0
for field in fields:
field = s1_re.sub('_',field.lower())
field = s2_re.sub('',field)
field = s3_re.sub('n',field)
field = s4_re.sub('GBP',field)
field = s5_re.sub('',field)
field = s6_re.sub('',field)
if field == "":
field = "empty_"+str(len(attributes))
if field in instances:
unique = False
instances[field] += 1
else:
instances[field] = 1
attributes.append(field)
break
if unique:
with open(tablename+'.sql','w+') as sqlfile:
sqlfile.write('drop table {} cascade;\n'.format(tablename))
sqlfile.write('create table {} ('.format(tablename))
sqlfile.write(' varchar, '.join(attributes)+' varchar);\n')
#[3/3] For csv file uncomment the next line and comment out tsv
#sqlfile.write("\\copy {0} from '{1}/{2}' csv header\n".format(tablename,os.getcwd(),filename))
sqlfile.write("\\copy {0} from '{1}/{2}' delimiter as '\t' csv header\n".format(tablename,os.getcwd(),filename))
else:
print('**duplicate fields')
for field in attributes:
print('{0},{1}'.format(field,instances[field]))
print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment