brusangues/apply-md5.py

## apply-md5.py
# Python code to search emails in csv, xlsx and json files;
# split email into name and domain;
# and apply md5 hash to the name, name in uppercase and name in lowercase;
# returning them in csv format.

import hashlib
import re
import csv
import glob
import pandas as pd

def csv_from_excel(filename):
    # Read and store content
    # of an excel file
    read_file = pd.read_excel(filename, engine='openpyxl')

    # Write the dataframe object
    # into csv file
    read_file.to_csv (filename+".csv",index = None, sep=";")


EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

def md5(text):
    return hashlib.md5(text.encode()).hexdigest()

def process(csv_lines, text):
    # All mail
    match = re.findall(EMAIL, text )
    if match:
        for m in match:
            m_split = m.split("@")
            if len(m_split)>=2:
                m_name = m_split[0]

                line = [m_name, md5(m_name), m]
                csv_lines.append(line)

                line = [m_name.lower(), md5(m_name.lower()), m]
                csv_lines.append(line)

                line = [m_name.upper(), md5(m_name.upper()), m]
                csv_lines.append(line)

csv_lines = []

for filename in glob.glob('./data/*.xlsx'):
    csv_from_excel(filename)

for filename in glob.glob('./data/*.json')+glob.glob('./data/*.csv'):
    print(filename)
    try:
        with open(filename, encoding = 'utf-8', mode='r') as file:
            text = file.read()
            process(csv_lines, text)
    except:
        with open(filename, mode='r') as file:
            text = file.read()
            process(csv_lines, text)

# Remove dupes
def removeDupes(csv_lines):
    csv_lines = sorted(csv_lines, key=lambda x: x[0])
    csv_dict = {}
    for line in csv_lines:
        csv_dict[line[0]] = line[1:]
    csv_lines_ = []
    for l0,l1 in csv_dict.items():
        csv_lines_.append([l0]+l1)
    csv_lines = csv_lines_
    return sorted(csv_lines, key=lambda x: x[0])

csv_lines = removeDupes(csv_lines)
print(csv_lines[:5])
print(len(csv_lines))

# open the file in the write mode
with open('output.csv', 'w', encoding='utf-8') as f:
    # create the csv writer
    writer = csv.writer(f, delimiter=';', lineterminator = '\n')
    for line in csv_lines:
        writer.writerow(line)
	# Python code to search emails in csv, xlsx and json files;
	# split email into name and domain;
	# and apply md5 hash to the name, name in uppercase and name in lowercase;
	# returning them in csv format.

	import hashlib
	import re
	import csv
	import glob
	import pandas as pd

	def csv_from_excel(filename):
	# Read and store content
	# of an excel file
	read_file = pd.read_excel(filename, engine='openpyxl')

	# Write the dataframe object
	# into csv file
	read_file.to_csv (filename+".csv",index = None, sep=";")


	EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z\|a-z]{2,}\b'

	def md5(text):
	return hashlib.md5(text.encode()).hexdigest()

	def process(csv_lines, text):
	# All mail
	match = re.findall(EMAIL, text )
	if match:
	for m in match:
	m_split = m.split("@")
	if len(m_split)>=2:
	m_name = m_split[0]

	line = [m_name, md5(m_name), m]
	csv_lines.append(line)

	line = [m_name.lower(), md5(m_name.lower()), m]
	csv_lines.append(line)

	line = [m_name.upper(), md5(m_name.upper()), m]
	csv_lines.append(line)

	csv_lines = []

	for filename in glob.glob('./data/*.xlsx'):
	csv_from_excel(filename)

	for filename in glob.glob('./data/.json')+glob.glob('./data/.csv'):
	print(filename)
	try:
	with open(filename, encoding = 'utf-8', mode='r') as file:
	text = file.read()
	process(csv_lines, text)
	except:
	with open(filename, mode='r') as file:
	text = file.read()
	process(csv_lines, text)

	# Remove dupes
	def removeDupes(csv_lines):
	csv_lines = sorted(csv_lines, key=lambda x: x[0])
	csv_dict = {}
	for line in csv_lines:
	csv_dict[line[0]] = line[1:]
	csv_lines_ = []
	for l0,l1 in csv_dict.items():
	csv_lines_.append([l0]+l1)
	csv_lines = csv_lines_
	return sorted(csv_lines, key=lambda x: x[0])

	csv_lines = removeDupes(csv_lines)
	print(csv_lines[:5])
	print(len(csv_lines))

	# open the file in the write mode
	with open('output.csv', 'w', encoding='utf-8') as f:
	# create the csv writer
	writer = csv.writer(f, delimiter=';', lineterminator = '\n')
	for line in csv_lines:
	writer.writerow(line)