Last active
September 4, 2023 20:48
-
-
Save fsndzomga/94f75091527369c59dc69894f1c3cff2 to your computer and use it in GitHub Desktop.
Email classification with spacy-llm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import pandas as pd
from spacy_llm.util import assemble

from keys import OPENAI_API_KEY

# Export the OpenAI key through the environment before the pipeline is
# assembled (presumably spacy-llm's OpenAI backend reads it from there --
# verify against email_config.cfg).
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Build the spaCy pipeline described by the config file; the relative path is
# resolved against the current working directory.
nlp = assemble("email_config.cfg")
# Hand-labelled sample emails, one record per email: (date, text, categories).
# Multi-label emails list their categories separated by ", ".
# Keeping each record together prevents the three columns from drifting out of
# alignment when an entry is added or removed.
_COLUMNS = ('date', 'email_content', 'category')
_LABELLED_EMAILS = [
    ('2021-01-01',
     'I am not happy with the service provided.',
     'complaint'),
    ('2021-02-15',
     'Please process my reimbursement as soon as possible.',
     'reimbursement'),
    ('2021-03-20',
     'How can I get coverage for my upcoming hospitalization?',
     'hospitalization'),
    ('2021-04-18',
     'What dental procedures are covered?',
     'dental'),
    ('2021-05-05',
     'Is eye care covered in our plan?',
     'vision'),
    ('2021-06-25',
     'What is the policy for maternity leave?',
     'maternity'),
    ('2021-07-14',
     'What coverage do I have for dental and vision?',
     'dental, vision'),
    ('2021-08-09',
     'I would like to file a complaint and get reimbursed for my expenses.',
     'complaint, reimbursement'),
    ('2021-09-30',
     'I am getting hospitalized and also have questions about maternity policy.',
     'hospitalization, maternity'),
    ('2021-10-12',
     'I am not satisfied with the dental service provided.',
     'complaint, dental'),
]

# Column-oriented view of the records: {column name: list of values}.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_LABELLED_EMAILS))
}

df = pd.DataFrame(data)
# Minimum score for a label in ``doc.cats`` to count as predicted.
# NOTE(review): assumes the pipeline scores labels 1.0 (assigned) / 0.0 (not
# assigned), as spacy-llm's text-classification task appears to -- confirm
# against email_config.cfg and adjust if the pipeline emits graded scores.
CATEGORY_THRESHOLD = 0.5


def _split_labels(label_text):
    """Return the set of labels in a ", "-separated string ("" gives an empty set)."""
    return {label for label in label_text.split(", ") if label}


def _predicted_labels(cats, threshold=CATEGORY_THRESHOLD):
    """Return every label in *cats* scoring at least *threshold*, in dict order.

    Unlike ``max(cats, key=cats.get)`` this keeps all assigned labels of a
    multi-label prediction, returns an empty list (instead of raising
    ``ValueError``) when *cats* is empty, and does not invent a label when
    every score is zero.
    """
    return [label for label, score in cats.items() if score >= threshold]


# Per-row results, filled in the same order as the rows of ``df``.
predicted_category = []
is_correct = []

# Classify each email and compare the prediction with its labelled categories.
for email_text, expected in zip(df["email_content"], df["category"]):
    doc = nlp(email_text)
    labels = _predicted_labels(doc.cats)
    predicted_set = set(labels)
    actual_set = _split_labels(expected)
    # Correct means: at least one label was predicted and every predicted
    # label is among the expected ones. An empty prediction is never correct,
    # even though the empty set is a subset of anything.
    is_all_present = bool(predicted_set) and predicted_set.issubset(actual_set)
    # Store the prediction in the same ", "-separated format as 'category'.
    predicted_category.append(", ".join(labels))
    is_correct.append(is_all_present)
# Attach the per-row results to the frame so they are printed and exported
# alongside the original columns.
df['predicted_category'] = predicted_category
df['is_correct'] = is_correct

# Accuracy = share of rows flagged correct, expressed as a percentage.
correct_predictions = df['is_correct'].sum()
accuracy_percentage = (correct_predictions / len(df)) * 100
print(f"The accuracy percentage of the predicted categories is {accuracy_percentage}%")

# Save the annotated frame to CSV in the current working directory (optional),
# without the positional index column.
df.to_csv('email_data_predicted.csv', index=False)

print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment