Skip to content

Instantly share code, notes, and snippets.

@cesar
Last active March 22, 2020 19:46
Show Gist options
  • Save cesar/e8604392408a09575c6e850264b40b61 to your computer and use it in GitHub Desktop.
Save cesar/e8604392408a09575c6e850264b40b61 to your computer and use it in GitHub Desktop.
import csv
import os
from difflib import get_close_matches
import shutil
import re
import copy
# Matches, in order of preference: a parenthesised group containing only word
# characters and whitespace (e.g. "(Jr)"), a literal period, or any single
# whitespace character.  The parenthesised part uses a character class instead
# of a nested ``(\w*|\s)*`` group so an unclosed "(" cannot trigger
# exponential backtracking.
_NAME_NOISE = re.compile(r"\([\w\s]*\)|\.|\s")


def sanitize_name(name):
    """Return *name* reduced to a lowercase key suitable for fuzzy matching.

    Parenthesised groups made of word characters and whitespace, periods and
    all whitespace are removed; what remains is lowercased.

    Args:
        name: The raw name string.

    Returns:
        The stripped, lowercased string (empty if nothing remains).
    """
    return _NAME_NOISE.sub('', name).lower()
def normalize_file_names():
    """Copy every file in ``fotos_politicos`` into ``tmp`` under a normalized name.

    The normalized name is built by dropping the last four characters of the
    original file name, discarding the final ``_``-separated token, lowercasing
    what is left and appending ``.jpg``.

    ``tmp`` is created when missing and reused when it already exists.  Files
    whose normalized names collide overwrite one another in ``tmp``.
    """
    os.makedirs('tmp', exist_ok=True)
    for original_name in os.listdir('fotos_politicos'):
        # NOTE(review): the [:-4] slice presumably strips a three-letter
        # extension such as ".jpg" - confirm against the real photo names.
        stem = original_name[:-4]
        normalized_name = '_'.join(stem.split('_')[:-1]).lower() + ".jpg"
        # copyfile opens and closes both files itself, so no handle is leaked.
        shutil.copyfile(
            os.path.join('fotos_politicos', original_name),
            os.path.join('tmp', normalized_name),
        )
def get_photo_names_mapping():
    """Describe every file in ``tmp`` as a ``{'file_name', 'key'}`` dict.

    ``key`` is the file name without its last four characters and with every
    underscore removed.

    Returns:
        A list of dicts, one per directory entry, ordered by file name so that
        repeated runs produce the same list regardless of the order in which
        the operating system lists the directory.
    """
    return [
        {'file_name': photo, 'key': photo[:-4].replace('_', '')}
        for photo in sorted(os.listdir('tmp'))
    ]
def get_photo_name_keys(photo_mapping):
    """Return the set of distinct ``key`` values found in *photo_mapping*.

    Args:
        photo_mapping: An iterable of dicts that each have a ``'key'`` entry.

    Returns:
        A new ``set`` holding every key exactly once.
    """
    return {entry['key'] for entry in photo_mapping}
def get_file_name(photos, key):
    """Return the ``file_name`` of the first entry in *photos* whose key is *key*.

    Args:
        photos: An iterable of dicts with ``'key'`` and ``'file_name'`` entries.
        key: The key to look up.

    Returns:
        The matching file name, or ``None`` when no entry has that key.
    """
    matches = (photo['file_name'] for photo in photos if photo['key'] == key)
    # Explicit default so the "not found" result is visible in the code.
    return next(matches, None)
def start():
    """Match each row of ``politicians.csv`` to a photo and write the result.

    Photos are first normalized into ``tmp``.  For every CSV row the
    ``first_name`` and ``last_name`` columns are concatenated, sanitized and
    fuzzy-matched against the photo keys that have not been claimed yet.  The
    rows are then written to ``politicians_updated.csv`` with two extra
    columns: ``sanitized_name`` and ``file`` (the matched photo file name).

    NOTE(review): the indentation of this function was lost in the copy under
    review, so the scope of the ``if selected_photo:`` branch is a
    reconstruction.  Here every input row is written, and rows with no close
    match get an empty ``file`` value - confirm this is the intended output.
    """
    normalize_file_names()
    photo_mappings = get_photo_names_mapping()
    # Keys still available for matching.  A key is removed once it has been
    # claimed so the same photo is never assigned to two rows.
    photo_keys = get_photo_name_keys(photo_mappings)

    results = []
    # newline='' lets the csv module do its own line-ending handling.
    with open('politicians.csv', encoding='utf-8', newline='') as f:
        politicians = csv.DictReader(f, delimiter=',')
        # Read the header up front so the output header can still be written
        # when the input has no data rows.
        field_names = list(politicians.fieldnames or [])
        field_names += ['sanitized_name', 'file']
        for p in politicians:
            sanitized_name = sanitize_name(p['first_name'] + p['last_name'])
            selected_photo = get_close_matches(sanitized_name, photo_keys, n=1)
            row = dict(p)
            row['sanitized_name'] = sanitized_name
            if selected_photo:
                photo_keys.remove(selected_photo[0])
                row['file'] = get_file_name(photo_mappings, selected_photo[0])
            else:
                row['file'] = ''
            results.append(row)

    with open('politicians_updated.csv', 'w', encoding='utf-8', newline='') as nf:
        writer = csv.DictWriter(nf, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(results)
if __name__ == "__main__":
    # Run the pipeline only when this file is executed as a script, so that
    # importing the module does not touch the file system.
    start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment