Skip to content

Instantly share code, notes, and snippets.

@fsndzomga
Last active September 4, 2023 20:48
Show Gist options
  • Save fsndzomga/94f75091527369c59dc69894f1c3cff2 to your computer and use it in GitHub Desktop.
Save fsndzomga/94f75091527369c59dc69894f1c3cff2 to your computer and use it in GitHub Desktop.
Email classification spacy llm
import pandas as pd
from keys import OPENAI_API_KEY
import os
from spacy_llm.util import assemble
import pandas as pd
# Publish the API key through the process environment before the pipeline
# is assembled (presumably where the spacy-llm OpenAI backend looks for it).
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

# Build the spaCy pipeline described by the config file; the path is
# relative, so it is resolved against the current working directory.
nlp = assemble("email_config.cfg")
# Labelled sample emails, one record per row:
# (date, email text, expected category label(s) joined by ", ").
_SAMPLE_EMAILS = [
    ('2021-01-01',
     'I am not happy with the service provided.',
     'complaint'),
    ('2021-02-15',
     'Please process my reimbursement as soon as possible.',
     'reimbursement'),
    ('2021-03-20',
     'How can I get coverage for my upcoming hospitalization?',
     'hospitalization'),
    ('2021-04-18',
     'What dental procedures are covered?',
     'dental'),
    ('2021-05-05',
     'Is eye care covered in our plan?',
     'vision'),
    ('2021-06-25',
     'What is the policy for maternity leave?',
     'maternity'),
    ('2021-07-14',
     'What coverage do I have for dental and vision?',
     'dental, vision'),
    ('2021-08-09',
     'I would like to file a complaint and get reimbursed for my expenses.',
     'complaint, reimbursement'),
    ('2021-09-30',
     'I am getting hospitalized and also have questions about maternity policy.',
     'hospitalization, maternity'),
    ('2021-10-12',
     'I am not satisfied with the dental service provided.',
     'complaint, dental'),
]
_COLUMNS = ('date', 'email_content', 'category')

# Transpose the row-wise records into the column-wise mapping of lists.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_SAMPLE_EMAILS))
}
df = pd.DataFrame(data)
# Classify every email with the pipeline and score it against its expected
# label(s). Both result lists stay index-aligned with the rows of `df`.
predicted_category = []
is_correct = []
for email_text, expected in zip(df["email_content"], df["category"]):
    doc = nlp(email_text)
    # Keep only the highest-scoring label; this single-label choice may need
    # to change depending on the pipeline. `default=""` covers a document
    # with no categories at all: max() would otherwise raise ValueError, and
    # the row is instead recorded as an incorrect prediction.
    predicted = max(doc.cats, key=doc.cats.get, default="")
    predicted_set = set(predicted.split(", "))
    actual_set = set(expected.split(", "))
    # A prediction counts as correct when every predicted label appears among
    # the expected ones; it does not have to cover all expected labels.
    predicted_category.append(predicted)
    is_correct.append(predicted_set.issubset(actual_set))
# Attach the per-email results collected in the lists as two new columns.
for column_name, column_values in (
    ('predicted_category', predicted_category),
    ('is_correct', is_correct),
):
    df[column_name] = column_values

# Summing the boolean column counts the rows flagged as correct.
correct_predictions = df['is_correct'].sum()
row_count = len(df)

# Share of correct rows, expressed as a percentage.
accuracy_percentage = (correct_predictions / row_count) * 100
print(f"The accuracy percentage of the predicted categories is {accuracy_percentage}%")

# Write the annotated table to CSV (without the index column), then show it.
df.to_csv('email_data_predicted.csv', index=False)
print(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment