Created
June 24, 2016 07:00
-
-
Save galvez/8076ea195cfda4e30eb7bbd336833476 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import itertools | |
def match(file, fields):
    """Give CSV rows that repeat a value in the chosen columns a shared ID.

    The first row of the CSV at ``file`` is treated as the header. Data rows
    are numbered from 1. For every indexed column the first row number in
    which each value appears is remembered; a row whose value was first seen
    in an earlier row is given that earlier row's number as its ID.

    Args:
        file: Path of the CSV file to read.
        fields: Column names to compare on. Each entry is either a column
            name or a list of column names.

    Returns:
        A list of strings: the header line prefixed with ``ID``, followed by
        one comma-joined line per data row prefixed with its assigned ID.
        Values are joined verbatim (no CSV quoting is applied).

    Raises:
        IndexError: If the file has no header row.
        KeyError: If a requested field is not in the header, or a data row
            has fewer cells than the header.
    """
    # The legacy 'rU' mode was removed in Python 3.11; newline='' is what
    # the csv module asks for, and the context manager closes the handle.
    with open(file, newline='') as handle:
        parsed = list(csv.reader(handle))
    columns = parsed[0]

    # One value -> first-row-number index per column named in ``fields``.
    index_fields = [
        name for group in fields if isinstance(group, list) for name in group
    ]
    index_fields += [name for name in fields if not isinstance(name, list)]
    indexes = {name: {} for name in index_fields}

    records = {}
    # Assigned IDs live in their own mapping so that a CSV column that
    # happens to be called 'id' cannot collide with them.
    assigned = {}
    for row_id, row in enumerate(parsed[1:], start=1):
        record = {}
        for column, value in zip(columns, row):
            record[column] = value
            if column in indexes:
                # Only the first occurrence of a value is recorded.
                indexes[column].setdefault(value, row_id)
        records[row_id] = record
        assigned[row_id] = row_id

    for row_id, record in records.items():
        first_seen = None
        for field in fields:
            names = field if isinstance(field, list) else [field]
            for name in names:
                if record[name] in indexes[name]:
                    first_seen = indexes[name][record[name]]
        # NOTE(review): the last field (or sub-field) consulted decides the
        # ID, overriding earlier ones -- confirm this precedence is intended.
        if first_seen is not None and first_seen != row_id:
            # Diagnostic output: the ID this row was merged into.
            print(first_seen)
            assigned[row_id] = first_seen

    result = [','.join(['ID'] + columns)]
    for row_id, record in records.items():
        cells = [str(assigned[row_id])] + [record[c] for c in columns]
        result.append(','.join(cells))
    return result
# FirstName,LastName,Phone1,Phone2,Email1,Email2,Zip
def test_same_email():
    """Print the match() output for each sample CSV, comparing e-mail columns."""
    cases = (
        ('input1.csv', ['Email']),
        ('input2.csv', [['Email1', 'Email2']]),
        ('input3.csv', [['Email1', 'Email2']]),
    )
    for filename, fields in cases:
        lines = match(filename, fields)
        print('\n'.join(lines))
def test_same_phone():
    """Print the match() output for each sample CSV, comparing phone columns."""
    cases = (
        ('input1.csv', ['Phone']),
        ('input2.csv', [['Phone1', 'Phone2']]),
        ('input3.csv', [['Phone1', 'Phone2']]),
    )
    for filename, fields in cases:
        lines = match(filename, fields)
        print('\n'.join(lines))
def test_same_phone_and_email():
    """Print the match() output for each sample CSV, passing phone and e-mail fields."""
    cases = (
        ('input1.csv', ['Phone', 'Email']),
        ('input2.csv', [['Phone1', 'Phone2'], ['Email1', 'Email2']]),
        ('input3.csv', [['Phone1', 'Phone2'], ['Email1', 'Email2']]),
    )
    for filename, fields in cases:
        lines = match(filename, fields)
        print('\n'.join(lines))
if __name__ == '__main__':
    # Run every module-level callable whose name starts with 'test'.
    # The namespace is copied into a list first: the loop variables are
    # themselves module globals, and binding them while iterating the live
    # globals() view raises "dictionary changed size during iteration" on
    # Python 3.
    for name, candidate in list(globals().items()):
        if callable(candidate) and name.startswith('test'):
            candidate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment