-
-
Save lukerosiak/4717157 to your computer and use it in GitHub Desktop.
Import OLMS into Postgres
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import psycopg2 | |
#IMPORT AND UNZIL ALL YEARS OF FILES INTO THIS DIRECTORY | |
#SET VARIABLES IN THE NEXT 3 LINES | |
path = '/media/sf_bulk/labor/data/' | |
years = [str(x) for x in range(2000,2013)] | |
conn = psycopg2.connect(database="labor", user="", password="") | |
#conn.autocommit = True | |
cur = conn.cursor() | |
for f in os.listdir(os.path.join(path,years[-1])): | |
if f.endswith('_meta.txt'): | |
name = f[:-len('_meta.txt')] | |
print 'processing %s' % name | |
cur.execute('DROP TABLE IF EXISTS %s;' % name) | |
sql = 'CREATE TABLE %s (\n' % name | |
fin = open( os.path.join(path,years[-1],f), 'r').readlines() | |
sqllines = [] | |
for line in fin[2:]: | |
vartype = line[63:].strip() | |
if vartype.upper()=='CHAR': vartype = 'VARCHAR' | |
null = line[54:63].strip() | |
col = line[:50].strip() | |
sqllines.append(' %s %s' % (col, vartype)) #add null if you want to enforce not-null | |
sql = sql + ',\n'.join(sqllines) + ');' | |
cur.execute(sql) | |
conn.commit() | |
numfields = len(sqllines) | |
for year in years: | |
if name=='ar_erds_codes' and year!=years[-1]: | |
break | |
filename = '%s_data_%s.txt' % (name,year) | |
fn = os.path.join(path,year,filename) | |
fdetail = open(fn,'r') | |
if name not in ['ar_assets_fixed','ar_assets_investments','ar_assets_loans_rcvbl','ar_assets_other','ar_disbursements_genrl', | |
'ar_disbursements_inv_purchases','ar_payer_payee','ar_rates_dues_fees','ar_receipts_inv_fa_sales', | |
'ar_receipts_other','lm_data']: | |
print year | |
cur.copy_expert("COPY %s FROM STDIN WITH DELIMITER '|' QUOTE E'\b' CSV HEADER" % name, fdetail) | |
else: | |
fdetail = open( fn,'r').readlines() | |
#deal with newlines inside fields | |
detail = [] | |
existing = '' | |
for line in fdetail[1:]: | |
existing = existing + line | |
if len(existing.split('|'))==numfields: | |
detail.append(existing) | |
existing = '' | |
elif len(existing.split('|'))>numfields: | |
print '%s ERROR: tried to combine lines, but too many fields' % year | |
print existing | |
existing = '' | |
print '%s: %s rows' % (year,len(detail)) | |
for line in detail: | |
fields_raw = line.strip().split('|') | |
fields = [] | |
for field in fields_raw: | |
if field=='': | |
fields.append(None) | |
else: | |
fields.append(field) | |
blanks = ','.join( ['%s' for x in fields]) | |
try: | |
cur.execute('INSERT INTO ' + name + ' VALUES (' + blanks + ');', fields) | |
except psycopg2.DataError as e: | |
print year | |
print e | |
conn.commit() | |
if 'rpt_id' in sql.lower(): | |
cur.execute('CREATE INDEX %s_rpt_id_idx ON %s (rpt_id);' % (name,name)) | |
conn.commit() | |
""" | |
alter table lm_data add column next_election_date date; | |
update lm_data set next_election=trim(next_election); | |
update lm_data set next_election=replace(next_election,'/','') where next_election like '%/%' and length(next_election)=7; | |
update lm_data set next_election=concat('0',replace(next_election,'/','')) where next_election like '%/%' and length(next_election)=6; | |
update lm_data set next_election_date=to_date(next_election,'MM/DD/YYYY') where length(next_election)=10 and next_election like '%/%/%'; | |
update lm_data set next_election_date=to_date(next_election,'MMYYYY') where next_election ~ e'^\\d{6}$'; | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment