Skip to content

Instantly share code, notes, and snippets.

@brusangues
Created July 5, 2021 20:33
Show Gist options
  • Save brusangues/7a02cf2140da763d8c011df65285e5c1 to your computer and use it in GitHub Desktop.
Save brusangues/7a02cf2140da763d8c011df65285e5c1 to your computer and use it in GitHub Desktop.
Python code to search emails in csv, xlsx and json files; split email into name and domain; and apply md5 hash to the name, name in uppercase and name in lowercase; returning them in csv format.
# Python code to search emails in csv, xlsx and json files;
# split email into name and domain;
# and apply md5 hash to the name, name in uppercase and name in lowercase;
# returning them in csv format.
import hashlib
import re
import csv
import glob
import pandas as pd
def csv_from_excel(filename):
# Read and store content
# of an excel file
read_file = pd.read_excel(filename, engine='openpyxl')
# Write the dataframe object
# into csv file
read_file.to_csv (filename+".csv",index = None, sep=";")
EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
def md5(text):
return hashlib.md5(text.encode()).hexdigest()
def process(csv_lines, text):
# All mail
match = re.findall(EMAIL, text )
if match:
for m in match:
m_split = m.split("@")
if len(m_split)>=2:
m_name = m_split[0]
line = [m_name, md5(m_name), m]
csv_lines.append(line)
line = [m_name.lower(), md5(m_name.lower()), m]
csv_lines.append(line)
line = [m_name.upper(), md5(m_name.upper()), m]
csv_lines.append(line)
csv_lines = []
for filename in glob.glob('./data/*.xlsx'):
csv_from_excel(filename)
for filename in glob.glob('./data/*.json')+glob.glob('./data/*.csv'):
print(filename)
try:
with open(filename, encoding = 'utf-8', mode='r') as file:
text = file.read()
process(csv_lines, text)
except:
with open(filename, mode='r') as file:
text = file.read()
process(csv_lines, text)
# Remove dupes
def removeDupes(csv_lines):
csv_lines = sorted(csv_lines, key=lambda x: x[0])
csv_dict = {}
for line in csv_lines:
csv_dict[line[0]] = line[1:]
csv_lines_ = []
for l0,l1 in csv_dict.items():
csv_lines_.append([l0]+l1)
csv_lines = csv_lines_
return sorted(csv_lines, key=lambda x: x[0])
csv_lines = removeDupes(csv_lines)
print(csv_lines[:5])
print(len(csv_lines))
# open the file in the write mode
with open('output.csv', 'w', encoding='utf-8') as f:
# create the csv writer
writer = csv.writer(f, delimiter=';', lineterminator = '\n')
for line in csv_lines:
writer.writerow(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment