Last active
January 3, 2023 10:24
-
-
Save ehzawad/7e577c7f4f06dc5579bc28800d1dfd29 to your computer and use it in GitHub Desktop.
Clean some weird base64-encoded data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import base64 | |
# Load the raw e-mail export.
unclean_data = pd.read_csv('/home/ehz/Downloads/email_date_dec22_Blank.csv')

# Keep just the customer e-mail column (note the leading space in the column
# name) and drop the rows where it is empty.
# A two-column variant was tried earlier:
#   unclean_data[[' Subject', ' Customer-Email']].dropna(axis=0, how='any')
email_column = unclean_data[' Customer-Email']
super_clean_data = email_column.dropna(axis=0, how='any')

# Write the surviving values out as text, with no header row and no index.
# NOTE(review): mode='a' appends, so a second run adds the same rows again and
# the read below then sees both copies -- confirm this is intended.
dump_path = '/home/ehz/Downloads/super_clean_data.txt'
super_clean_data.to_csv(dump_path, mode='a', index=False, header=None)

# Reopen the dump for reading. The handle is deliberately left open here:
# the `for line in f` loop later in the script consumes it.
f = open(dump_path, 'r')
# print(f.readlines()) | |
# count = 0 | |
# | |
# for line in f: | |
# count += 1 | |
# print(base64.b64decode(line)) | |
# print("{}{}".format(count, line.strip())) | |
from bs4 import BeautifulSoup | |
def isascii(s):
    """Return True if every character of string *s* is ASCII (U+0000-U+007F).

    The empty string counts as ASCII. Characters that cannot be encoded at
    all (for example lone surrogates) are reported as non-ASCII instead of
    raising.
    """
    try:
        # Encoding to ASCII fails on the first character above U+007F.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True
# Decode each base64 line read from `f`, extract the text of the first
# <div dir="auto"> inside the HTML body, and append it to the English file
# when it is pure ASCII or to the Bengali file otherwise.
#
# The two output files are opened once for the whole run (append mode, so
# they are created even when no row qualifies) and every handle, including
# the input handle `f`, is closed when the block exits.
with f, \
        open('/home/ehz/Downloads/super_clean_data_bn.txt', 'a',
             encoding='utf-8') as file_bn, \
        open('/home/ehz/Downloads/super_clean_data_en.txt', 'a',
             encoding='utf-8') as file_en:
    for line in f:
        # Surplus '=' padding is appended before decoding -- presumably
        # because some rows arrive without their own base64 padding.
        # b64decode's default (non-validating) mode discards characters
        # outside the base64 alphabet, so the trailing newline is harmless.
        payload = base64.b64decode(line + "==========")
        soup = BeautifulSoup(payload, 'html.parser')

        # Skip rows whose HTML has no <body>.
        if soup.body is None:
            continue

        # Look the container up once and reuse the result.
        message_div = soup.body.find('div', attrs={'dir': 'auto'})
        if message_div is None:
            continue

        final = message_div.text
        # Texts of two characters or fewer are ignored.
        if len(final) <= 2:
            continue

        print(final)
        if isascii(final):
            file_en.write(final + '\n')
            print('en')
        else:
            file_bn.write(final + '\n')
            print('bn')
Author
ehzawad
commented
Jan 3, 2023
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment