Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Last active January 3, 2023 10:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ehzawad/7e577c7f4f06dc5579bc28800d1dfd29 to your computer and use it in GitHub Desktop.
Save ehzawad/7e577c7f4f06dc5579bc28800d1dfd29 to your computer and use it in GitHub Desktop.
Clean some weird base64-encoded data (decode it, extract the message text, split it into ASCII and non-ASCII files)
import pandas as pd
import base64
# Load the raw e-mail export into a DataFrame.
unclean_data = pd.read_csv('/home/ehz/Downloads/email_date_dec22_Blank.csv')

# Keep only the ' Customer-Email' column (the label is used here with its
# leading space) and discard the rows where that value is missing.
super_clean_data = unclean_data[' Customer-Email'].dropna(axis=0, how='any')

# Append the surviving values, one per line and without header or index, to
# a plain-text scratch file.  mode='a' means repeated runs accumulate rows.
scratch_path = r'/home/ehz/Downloads/super_clean_data.txt'
super_clean_data.to_csv(
    scratch_path,
    header=None,
    index=False,
    mode='a',
)

# Re-open the scratch file for reading; the decoding loop further down
# iterates over this handle, so it is intentionally left open here.
f = open(scratch_path, 'r')
from bs4 import BeautifulSoup
def isascii(s):
    """Return True when every character of *s* is ASCII (U+0000-U+007F).

    Each code point is inspected directly instead of comparing the string
    length with the length of its UTF-8 encoding.  The answer is the same
    for ordinary text, but this form does not have to encode the string, so
    text containing lone surrogates yields False rather than raising
    ``UnicodeEncodeError``.

    Args:
        s: The text to inspect.

    Returns:
        True if *s* is empty or contains only code points below 0x80,
        otherwise False.
    """
    return all(ord(ch) < 0x80 for ch in s)
# Decode every stored row, pull the user-written text out of the HTML and
# append it to the ASCII ("en") or the non-ASCII ("bn") output file.
#
# Both output files are opened once, up front, and `with f` closes the input
# handle when the loop is done.  Previously two fresh handles were opened for
# every matching row and none of the three files was ever closed.  The
# outputs are written as UTF-8 explicitly because non-ASCII text goes into
# the "bn" file and the platform default encoding may not be able to hold it.
with f:
    with open('/home/ehz/Downloads/super_clean_data_bn.txt', 'a',
              encoding='utf-8') as file_bn, \
         open('/home/ehz/Downloads/super_clean_data_en.txt', 'a',
              encoding='utf-8') as file_en:
        for line in f:
            # Surplus '=' characters are appended before decoding, presumably
            # so that values whose base64 padding was stripped still decode.
            # The line's trailing newline is not a base64 character and is
            # discarded by b64decode's default (non-validating) mode.
            html = base64.b64decode(line + "==========")
            parsed_html = BeautifulSoup(html, 'html.parser')

            # Rows without a <body> carry nothing to extract.
            if parsed_html.body is None:
                continue

            # The text of interest lives in the first <div dir="auto">;
            # look it up once instead of once for the test and once more
            # for the value.
            user_div = parsed_html.body.find('div', attrs={'dir': 'auto'})
            if user_div is None:
                continue

            final = user_div.text
            # Ignore fragments of two characters or fewer.
            if len(final) <= 2:
                continue

            print(final)
            if isascii(final):
                file_en.write(final + '\n')
                print('en')
            else:
                file_bn.write(final + '\n')
                print('bn')
@ehzawad
Copy link
Author

ehzawad commented Jan 3, 2023

import json

# Turn every line of the input text file into a {"text": ...} JSON object and
# append it to jsonified.json, each object followed by a newline and a comma
# (the same record framing as before, so the result can still be wrapped in
# [ ] by hand).
#
# json.dumps does the quoting: the previous string concatenation left quotes
# and backslashes unescaped and kept the line's trailing newline inside the
# string literal, which made every object invalid JSON.  Both files are now
# closed by the `with` block and are read/written as UTF-8 explicitly
# (the "_bn" input presumably holds non-ASCII text).
with open('/home/ehz/Downloads/EMAIL/Blank_email_closed_by_RPA_bn.txt', 'r',
          encoding='utf-8') as l, \
     open('/home/ehz/Downloads/jsonified.json', 'a',
          encoding='utf-8') as file_o:
    for i in l:
        # Strip only the line terminator; ensure_ascii=False keeps non-ASCII
        # characters readable instead of turning them into \uXXXX escapes.
        strr = json.dumps({"text": i.rstrip('\r\n')}, ensure_ascii=False, indent=2)
        file_o.write(strr + '\n,')
  

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment