lukerosiak/OLMS

## OLMS
import os
import psycopg2

#IMPORT AND UNZIL ALL YEARS OF FILES INTO THIS DIRECTORY
#SET VARIABLES IN THE NEXT 3 LINES
path = '/media/sf_bulk/labor/data/'
years = [str(x) for x in range(2000,2013)]

conn = psycopg2.connect(database="labor", user="", password="")

#conn.autocommit = True
cur = conn.cursor()


for f in os.listdir(os.path.join(path,years[-1])):
    if f.endswith('_meta.txt'):
        name = f[:-len('_meta.txt')]
        print 'processing %s' % name
        cur.execute('DROP TABLE IF EXISTS %s;' % name)
        sql = 'CREATE TABLE %s (\n' % name
        fin =  open( os.path.join(path,years[-1],f), 'r').readlines()
        sqllines = []
        for line in fin[2:]:
            vartype = line[63:].strip()
            if vartype.upper()=='CHAR': vartype = 'VARCHAR'
            null = line[54:63].strip()
            col = line[:50].strip()
            sqllines.append('   %s %s' % (col, vartype)) #add null if you want to enforce not-null
        sql = sql + ',\n'.join(sqllines) + ');'
        cur.execute(sql)
        conn.commit()
        numfields = len(sqllines)


        for year in years:


            if name=='ar_erds_codes' and year!=years[-1]:
                break


            filename = '%s_data_%s.txt' % (name,year)
            fn = os.path.join(path,year,filename)
            fdetail = open(fn,'r')


            if name not in ['ar_assets_fixed','ar_assets_investments','ar_assets_loans_rcvbl','ar_assets_other','ar_disbursements_genrl',
                'ar_disbursements_inv_purchases','ar_payer_payee','ar_rates_dues_fees','ar_receipts_inv_fa_sales',
                'ar_receipts_other','lm_data']:

                print year
                cur.copy_expert("COPY %s FROM STDIN WITH DELIMITER '|' QUOTE E'\b' CSV HEADER" % name, fdetail)


            else:

                fdetail = open( fn,'r').readlines()
                #deal with newlines inside fields
                detail = []
                existing = ''
                for line in fdetail[1:]:
                    existing = existing + line
                    if len(existing.split('|'))==numfields:
                        detail.append(existing)
                        existing = ''
                    elif len(existing.split('|'))>numfields:
                        print '%s ERROR: tried to combine lines, but too many fields' % year
                        print existing
                        existing = ''

                print '%s: %s rows' % (year,len(detail))

                for line in detail:
                    fields_raw = line.strip().split('|')
                    fields = []
                    for field in fields_raw:
                        if field=='':
                            fields.append(None)
                        else:
                            fields.append(field)
                    blanks = ','.join( ['%s' for x in fields])
                    try:
                        cur.execute('INSERT INTO ' + name + ' VALUES (' + blanks + ');', fields)
                    except psycopg2.DataError as e:
                        print year
                        print e

                conn.commit()

        if 'rpt_id' in sql.lower():
            cur.execute('CREATE INDEX %s_rpt_id_idx ON %s (rpt_id);' % (name,name))
            conn.commit()


"""
alter table lm_data add column next_election_date date;
update lm_data set next_election=trim(next_election);
update lm_data set next_election=replace(next_election,'/','') where next_election like '%/%' and length(next_election)=7;
update lm_data set next_election=concat('0',replace(next_election,'/','')) where next_election like '%/%' and length(next_election)=6;
update lm_data set next_election_date=to_date(next_election,'MM/DD/YYYY') where length(next_election)=10 and next_election like '%/%/%';
update lm_data set next_election_date=to_date(next_election,'MMYYYY') where next_election ~ e'^\\d{6}$';
"""
	import os
	import psycopg2

	#IMPORT AND UNZIL ALL YEARS OF FILES INTO THIS DIRECTORY
	#SET VARIABLES IN THE NEXT 3 LINES
	path = '/media/sf_bulk/labor/data/'
	years = [str(x) for x in range(2000,2013)]

	conn = psycopg2.connect(database="labor", user="", password="")

	#conn.autocommit = True
	cur = conn.cursor()



	for f in os.listdir(os.path.join(path,years[-1])):
	if f.endswith('_meta.txt'):
	name = f[:-len('_meta.txt')]
	print 'processing %s' % name
	cur.execute('DROP TABLE IF EXISTS %s;' % name)
	sql = 'CREATE TABLE %s (\n' % name
	fin = open( os.path.join(path,years[-1],f), 'r').readlines()
	sqllines = []
	for line in fin[2:]:
	vartype = line[63:].strip()
	if vartype.upper()=='CHAR': vartype = 'VARCHAR'
	null = line[54:63].strip()
	col = line[:50].strip()
	sqllines.append(' %s %s' % (col, vartype)) #add null if you want to enforce not-null
	sql = sql + ',\n'.join(sqllines) + ');'
	cur.execute(sql)
	conn.commit()
	numfields = len(sqllines)


	for year in years:


	if name=='ar_erds_codes' and year!=years[-1]:
	break


	filename = '%s_data_%s.txt' % (name,year)
	fn = os.path.join(path,year,filename)
	fdetail = open(fn,'r')


	if name not in ['ar_assets_fixed','ar_assets_investments','ar_assets_loans_rcvbl','ar_assets_other','ar_disbursements_genrl',
	'ar_disbursements_inv_purchases','ar_payer_payee','ar_rates_dues_fees','ar_receipts_inv_fa_sales',
	'ar_receipts_other','lm_data']:

	print year
	cur.copy_expert("COPY %s FROM STDIN WITH DELIMITER '\|' QUOTE E'\b' CSV HEADER" % name, fdetail)


	else:

	fdetail = open( fn,'r').readlines()
	#deal with newlines inside fields
	detail = []
	existing = ''
	for line in fdetail[1:]:
	existing = existing + line
	if len(existing.split('\|'))==numfields:
	detail.append(existing)
	existing = ''
	elif len(existing.split('\|'))>numfields:
	print '%s ERROR: tried to combine lines, but too many fields' % year
	print existing
	existing = ''

	print '%s: %s rows' % (year,len(detail))

	for line in detail:
	fields_raw = line.strip().split('\|')
	fields = []
	for field in fields_raw:
	if field=='':
	fields.append(None)
	else:
	fields.append(field)
	blanks = ','.join( ['%s' for x in fields])
	try:
	cur.execute('INSERT INTO ' + name + ' VALUES (' + blanks + ');', fields)
	except psycopg2.DataError as e:
	print year
	print e

	conn.commit()

	if 'rpt_id' in sql.lower():
	cur.execute('CREATE INDEX %s_rpt_id_idx ON %s (rpt_id);' % (name,name))
	conn.commit()


	"""
	alter table lm_data add column next_election_date date;
	update lm_data set next_election=trim(next_election);
	update lm_data set next_election=replace(next_election,'/','') where next_election like '%/%' and length(next_election)=7;
	update lm_data set next_election=concat('0',replace(next_election,'/','')) where next_election like '%/%' and length(next_election)=6;
	update lm_data set next_election_date=to_date(next_election,'MM/DD/YYYY') where length(next_election)=10 and next_election like '%/%/%';
	update lm_data set next_election_date=to_date(next_election,'MMYYYY') where next_election ~ e'^\\d{6}$';
	"""