Skip to content

Instantly share code, notes, and snippets.

@lukerosiak
Created February 5, 2013 20:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukerosiak/4717157 to your computer and use it in GitHub Desktop.
Save lukerosiak/4717157 to your computer and use it in GitHub Desktop.
Import OLMS into Postgres
import os
import psycopg2
#IMPORT AND UNZIL ALL YEARS OF FILES INTO THIS DIRECTORY
#SET VARIABLES IN THE NEXT 3 LINES
path = '/media/sf_bulk/labor/data/'
years = [str(x) for x in range(2000,2013)]
conn = psycopg2.connect(database="labor", user="", password="")
#conn.autocommit = True
cur = conn.cursor()
for f in os.listdir(os.path.join(path,years[-1])):
if f.endswith('_meta.txt'):
name = f[:-len('_meta.txt')]
print 'processing %s' % name
cur.execute('DROP TABLE IF EXISTS %s;' % name)
sql = 'CREATE TABLE %s (\n' % name
fin = open( os.path.join(path,years[-1],f), 'r').readlines()
sqllines = []
for line in fin[2:]:
vartype = line[63:].strip()
if vartype.upper()=='CHAR': vartype = 'VARCHAR'
null = line[54:63].strip()
col = line[:50].strip()
sqllines.append(' %s %s' % (col, vartype)) #add null if you want to enforce not-null
sql = sql + ',\n'.join(sqllines) + ');'
cur.execute(sql)
conn.commit()
numfields = len(sqllines)
for year in years:
if name=='ar_erds_codes' and year!=years[-1]:
break
filename = '%s_data_%s.txt' % (name,year)
fn = os.path.join(path,year,filename)
fdetail = open(fn,'r')
if name not in ['ar_assets_fixed','ar_assets_investments','ar_assets_loans_rcvbl','ar_assets_other','ar_disbursements_genrl',
'ar_disbursements_inv_purchases','ar_payer_payee','ar_rates_dues_fees','ar_receipts_inv_fa_sales',
'ar_receipts_other','lm_data']:
print year
cur.copy_expert("COPY %s FROM STDIN WITH DELIMITER '|' QUOTE E'\b' CSV HEADER" % name, fdetail)
else:
fdetail = open( fn,'r').readlines()
#deal with newlines inside fields
detail = []
existing = ''
for line in fdetail[1:]:
existing = existing + line
if len(existing.split('|'))==numfields:
detail.append(existing)
existing = ''
elif len(existing.split('|'))>numfields:
print '%s ERROR: tried to combine lines, but too many fields' % year
print existing
existing = ''
print '%s: %s rows' % (year,len(detail))
for line in detail:
fields_raw = line.strip().split('|')
fields = []
for field in fields_raw:
if field=='':
fields.append(None)
else:
fields.append(field)
blanks = ','.join( ['%s' for x in fields])
try:
cur.execute('INSERT INTO ' + name + ' VALUES (' + blanks + ');', fields)
except psycopg2.DataError as e:
print year
print e
conn.commit()
if 'rpt_id' in sql.lower():
cur.execute('CREATE INDEX %s_rpt_id_idx ON %s (rpt_id);' % (name,name))
conn.commit()
"""
alter table lm_data add column next_election_date date;
update lm_data set next_election=trim(next_election);
update lm_data set next_election=replace(next_election,'/','') where next_election like '%/%' and length(next_election)=7;
update lm_data set next_election=concat('0',replace(next_election,'/','')) where next_election like '%/%' and length(next_election)=6;
update lm_data set next_election_date=to_date(next_election,'MM/DD/YYYY') where length(next_election)=10 and next_election like '%/%/%';
update lm_data set next_election_date=to_date(next_election,'MMYYYY') where next_election ~ e'^\\d{6}$';
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment