Created
August 5, 2010 13:47
-
-
Save symroe/509753 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
import re | |
import csv | |
import sys | |
# Rows arrive as comma-separated values on standard input; processed rows
# are emitted as CSV on standard output.
c = csv.reader(sys.stdin, delimiter=',')
out_file = csv.writer(sys.stdout)
def per_hour_to_annum(value, hours_per_week=40, weeks_per_year=52):
    """Convert an hourly rate into a yearly figure.

    Args:
        value: Hourly rate as a number.
        hours_per_week: Hours assumed to be worked each week (default 40).
        weeks_per_year: Weeks assumed to be paid each year (default 52).

    Returns:
        ``value * hours_per_week * weeks_per_year``.
    """
    return value * hours_per_week * weeks_per_year
def _clean_salary(raw):
    """Best-effort reduction of a free-text pay description to a number.

    NOTE(review): the nesting of the conditionals below was reconstructed
    from the statement order; confirm it against the intended behaviour.

    Args:
        raw: The original text of the column being cleaned (presumably a
            salary description such as "£25k per annum" -- verify against
            the input data).

    Returns:
        A float when an hourly rate was recognised and converted to a
        yearly figure; otherwise a string that the caller still has to
        convert with ``float()`` (it may be empty or non-numeric).
    """
    col = raw.lower()
    # Expand the "k" shorthand: "25k" -> "25,000".
    col = re.sub(r'([\d])k', r'\1,000', col)

    # Discard everything in front of the first pound sign. A single search
    # is used so that a field where the anchored pattern does not match
    # (e.g. it contains embedded newlines) is left alone rather than
    # crashing on ``None.group``.
    pound = re.search('(£.*$)', col)
    if pound:
        col = pound.group(0)
    col = re.sub('£', '', col)

    if re.search('per hour', col):
        col = re.sub(r'([a-zA-Z]+)', '', col)
        col = re.sub(r'^-', '', col)
        col = re.sub(r'%', '', col)
        # For a range ("10-12") keep the lower bound only.
        col = re.split('-', col)[0]
        col = col.strip()
        col = re.sub(r'\+', '', col)
        if re.match('[0-9]', col):
            col = re.split(' ', col)[0]
            col = re.split('/', col)[0]
        try:
            col = per_hour_to_annum(float(col))
        except ValueError:
            # Deliberately best-effort: leave the text as it is; the
            # caller's float() conversion will report it.
            pass
    elif re.search(r'per annum|pa |p\/a', col):
        if re.match('^[0-9]', col):
            col = re.split(' ', col)[0]
            col = re.split('Â|-', col)[0]
            col = re.sub(r'[^0-9\.]', '', col)
        # If there is still anything with 2 chars, assume it should have
        # more ("25" -> "25000").
        if len(col) <= 2:
            col = "%s000" % col
        # Sometimes a dot is used as a thousands separator ("25.000"):
        # drop a dot that follows at most two leading digits.
        col = re.sub(r'^([0-9]{0,2})\.', r'\1', col)
        col = re.sub('[a-z-A-Z]', '', col)
    else:
        col = re.split(' ', col)[0]
        # Keeping only digits and dots also removes letters and commas.
        col = re.sub(r'[^0-9\.]', '', col)
    return col


for r in c:
    # Rows without a sixth column have nothing to clean and are dropped.
    if len(r) <= 5:
        continue

    orig_col = r[5]
    col = _clean_salary(orig_col)
    if not str(col):
        continue

    try:
        col = float(col)
    except ValueError:
        # Report unparseable values on stderr so the CSV written to stdout
        # stays well-formed.
        sys.stderr.write("%r\n" % (col,))
        continue

    if col:
        # Replace the column with the numeric value and keep the original
        # text in a new column right after it.
        r[5] = col
        r.insert(6, orig_col)
        out_file.writerow(r)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment