Skip to content

Instantly share code, notes, and snippets.

@galvez
Created June 24, 2016 07:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save galvez/8076ea195cfda4e30eb7bbd336833476 to your computer and use it in GitHub Desktop.
Save galvez/8076ea195cfda4e30eb7bbd336833476 to your computer and use it in GitHub Desktop.
import csv
import io
import itertools
def _csv_line(values):
    """Return *values* serialized as one CSV line without a line terminator."""
    buffer = io.StringIO()
    # csv.writer quotes fields that contain commas or quote characters,
    # which a plain ','.join() would silently corrupt.
    csv.writer(buffer, lineterminator='').writerow(values)
    return buffer.getvalue()


def match(file, fields):
    """Group the records of a CSV file that share values in *fields*.

    The first non-blank row of *file* is the header; every following
    non-blank row is a record, numbered from 1 in file order. A record's
    group ID is its own number unless an earlier record holds the same
    non-blank value in one of the match columns, in which case the record
    takes that earlier record's group ID (the smallest one when several
    columns match). Groups are resolved in a single forward pass, so two
    existing groups that are only connected by a later record are not
    merged retroactively.

    Args:
        file: Path of the CSV file to read.
        fields: Column names to match on. An entry may also be a list or
            tuple of column names; such entries are flattened, so
            ``[['Email1', 'Email2']]`` behaves like ``['Email1', 'Email2']``.
            Values are only compared within the same column.

    Returns:
        A list of CSV-formatted strings: the header line ``ID,<columns>``
        followed by one line per record with its group ID prepended.
        An empty list if the file contains no rows at all.

    Raises:
        KeyError: If a name in *fields* is not a column of the file and the
            file contains at least one record.
    """
    # newline='' is what the csv module expects; the context manager closes
    # the handle. The former 'rU' mode is rejected by Python 3.11+.
    with open(file, newline='') as handle:
        # csv.reader yields [] for blank lines; they carry no record.
        rows = [row for row in csv.reader(handle) if row]
    if not rows:
        return []
    columns, data = rows[0], rows[1:]

    match_columns = []
    for field in fields:
        if isinstance(field, (list, tuple)):
            match_columns.extend(field)
        else:
            match_columns.append(field)

    # first_seen[column][value] -> number of the first record holding value.
    first_seen = {column: {} for column in match_columns}
    group_ids = {}
    records = []
    for record_id, row in enumerate(data, start=1):
        # Pad short rows so every record has a value for every column.
        padded = row + [''] * (len(columns) - len(row))
        record = dict(zip(columns, padded))
        records.append(record)

        group_id = record_id
        for column in match_columns:
            value = record[column]
            if not value.strip():
                # A blank cell is missing data, not a shared identity.
                continue
            owner = first_seen[column].setdefault(value, record_id)
            if owner != record_id:
                # Follow the earlier record to the group it already joined,
                # and keep the smallest ID across all match columns.
                group_id = min(group_id, group_ids[owner])
        group_ids[record_id] = group_id

    result = [_csv_line(['ID'] + columns)]
    for record_id, record in enumerate(records, start=1):
        values = [record[column] for column in columns]
        result.append(_csv_line([group_ids[record_id]] + values))
    return result
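# Column layout used by the multi-column tests below (presumably the header
# of input2.csv / input3.csv): FirstName,LastName,Phone1,Phone2,Email1,Email2,Zip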
def test_same_email():
    """Print the match() output of each input file, grouped by e-mail columns."""
    cases = (
        ('input1.csv', ['Email']),
        ('input2.csv', [['Email1', 'Email2']]),
        ('input3.csv', [['Email1', 'Email2']]),
    )
    for path, match_fields in cases:
        lines = match(path, match_fields)
        print('\n'.join(lines))
def test_same_phone():
    """Print the match() output of each input file, grouped by phone columns."""
    cases = (
        ('input1.csv', ['Phone']),
        ('input2.csv', [['Phone1', 'Phone2']]),
        ('input3.csv', [['Phone1', 'Phone2']]),
    )
    for path, match_fields in cases:
        lines = match(path, match_fields)
        print('\n'.join(lines))
def test_same_phone_and_email():
    """Print the match() output of each input file, grouped by phone and e-mail columns."""
    cases = (
        ('input1.csv', ['Phone', 'Email']),
        ('input2.csv', [['Phone1', 'Phone2'], ['Email1', 'Email2']]),
        ('input3.csv', [['Phone1', 'Phone2'], ['Email1', 'Email2']]),
    )
    for path, match_fields in cases:
        lines = match(path, match_fields)
        print('\n'.join(lines))
def run_tests(namespace):
    """Call every callable in *namespace* whose name starts with ``'test'``.

    Args:
        namespace: Mapping of names to objects, e.g. ``globals()``.
            Entries are visited in the mapping's iteration order.
    """
    # Iterate over a snapshot. Looping directly over a live view of
    # globals() at module level breaks on Python 3: the loop variables
    # themselves add keys, raising "dictionary changed size during
    # iteration".
    for name, candidate in list(namespace.items()):
        if callable(candidate) and name.startswith('test'):
            candidate()


if __name__ == '__main__':
    run_tests(globals())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment