Skip to content

Instantly share code, notes, and snippets.

@symroe
Created August 5, 2010 13:47
Show Gist options
  • Save symroe/509753 to your computer and use it in GitHub Desktop.
Save symroe/509753 to your computer and use it in GitHub Desktop.
# encoding: utf-8
import re
import csv
import sys
# The script is a stdin -> stdout filter: rows are parsed from standard
# input as comma-separated values ...
c = csv.reader(sys.stdin, delimiter=',')
# ... and the rows that survive cleaning are written to standard output.
out_file = csv.writer(sys.stdout)
def per_hour_to_annum(value, hours_per_week=40, weeks_per_year=52):
    """Convert an hourly rate into an annual figure.

    The result is ``value * hours_per_week * weeks_per_year``.

    Args:
        value: The hourly rate, as a number.
        hours_per_week: Hours worked per week (defaults to 40).
        weeks_per_year: Weeks worked per year (defaults to 52).

    Returns:
        The annualised amount, of the same numeric type the multiplication
        produces for ``value``.
    """
    return value * hours_per_week * weeks_per_year
def _clean_salary_text(raw):
    """Reduce a free-text salary cell to something ``float()`` can parse.

    NOTE(review): the indentation of the original script was lost, so the
    nesting here is a reconstruction; each ``if`` that precedes a run of
    assignments is assumed to guard only the first of them -- confirm
    against the original gist.

    Args:
        raw: The original cell text.

    Returns:
        A float (already annualised) when the text mentions "per hour" and
        the hourly figure parses; otherwise the cleaned-up string, which may
        be empty or may still not be a valid number.
    """
    col = raw.lower()
    # "25k" -> "25,000" (runs after lower(), so "25K" is covered as well).
    col = re.sub(r'([\d])k', r'\1,000', col)

    # Keep only the text from the pound sign onwards.  The match is checked
    # before use because '(£.*$)' can fail even when a '£' is present (for
    # example on a multi-line cell), which used to raise AttributeError.
    from_pound = re.search('(£.*$)', col)
    if from_pound:
        col = from_pound.group(0)
    col = re.sub('£', '', col)

    if re.search('per hour', col):
        col = re.sub(r'([a-zA-Z]+)', '', col)
        col = re.sub(r'^-', '', col)
        col = re.sub(r'%', '', col)
        # For a range such as "7-9" keep the part before the hyphen.
        col = re.split('-', col)[0]
        col = col.strip()
        col = re.sub(r'\+', '', col)
        if re.match('[0-9]', col):
            col = re.split(' ', col)[0]
        col = re.split('/', col)[0]
        try:
            col = per_hour_to_annum(float(col))
        except ValueError:
            # Not a parseable hourly figure: leave the text as it is and let
            # the caller's float() conversion report it.
            pass
    elif re.search(r'per annum|pa |p\/a', col):
        if re.match('^[0-9]', col):
            col = re.split(' ', col)[0]
        # Keep what precedes a hyphen or a 'Â' (presumably residue of a
        # mis-decoded pound sign -- verify against the input data).
        col = re.split('Â|-', col)[0]
        col = re.sub(r'[^0-9\.]', '', col)
        # If there is still anything with 2 chars, assume it should have more!
        if len(col) <= 2:
            col = "%s000" % col
        # Sometimes people use dots in the wrong way ("25.000" for 25000):
        # drop a dot that follows at most two leading digits.
        if re.search(r'^([0-9]{0,2})\.', col):
            col = re.sub(r'^([0-9]{0,2})\.', r'\1', col)
        col = re.sub('[a-z-A-Z]', '', col)
    else:
        col = re.split(' ', col)[0]
        col = re.sub('[a-zA-Z]', '', col)
        col = re.sub(',', '', col)
        col = re.sub(r'[^0-9\.]', '', col)
    return col


# For every row with at least six columns, replace column 5 with the cleaned
# numeric salary and insert the original text right after it (column 6).
# Rows whose salary is empty, zero or unparseable are not written.
for row in c:
    if len(row) <= 5:
        continue
    orig_col = row[5]
    col = _clean_salary_text(orig_col)
    if not str(col):
        continue
    try:
        amount = float(col)
    except ValueError:
        # Report the rejected value on stderr so that diagnostics do not end
        # up inside the CSV written to stdout.
        sys.stderr.write('%r\n' % (col,))
        continue
    if amount:
        row[5] = amount
        row.insert(6, orig_col)
        out_file.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment