# fsndzomga/email_classification.py

## email_classification.py
import pandas as pd
from keys import OPENAI_API_KEY
import os
from spacy_llm.util import assemble
import pandas as pd

# Export the API key to the environment before the pipeline is assembled.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Assemble the pipeline from its config file.
nlp = assemble("email_config.cfg")

# Sample emails with their expected label(s); several labels are joined by ", ".
data = {
    'date': [
        '2021-01-01', '2021-02-15', '2021-03-20', '2021-04-18',
        '2021-05-05', '2021-06-25', '2021-07-14', '2021-08-09',
        '2021-09-30', '2021-10-12',
    ],
    'email_content': [
        'I am not happy with the service provided.',
        'Please process my reimbursement as soon as possible.',
        'How can I get coverage for my upcoming hospitalization?',
        'What dental procedures are covered?',
        'Is eye care covered in our plan?',
        'What is the policy for maternity leave?',
        'What coverage do I have for dental and vision?',
        'I would like to file a complaint and get reimbursed for my expenses.',
        'I am getting hospitalized and also have questions about maternity policy.',
        'I am not satisfied with the dental service provided.',
    ],
    'category': [
        'complaint', 'reimbursement', 'hospitalization', 'dental',
        'vision', 'maternity', 'dental, vision', 'complaint, reimbursement',
        'hospitalization, maternity', 'complaint, dental',
    ],
}

df = pd.DataFrame(data)


def _top_category(cats):
    """Return the highest-scoring label in ``cats``.

    Args:
        cats: Mapping of label -> score, as read from ``doc.cats``.

    Returns:
        The label with the largest score (the first one on ties), or an
        empty string when ``cats`` is empty, so that a document without any
        category is scored as a miss instead of making ``max()`` raise
        ``ValueError``.
    """
    if not cats:
        return ""
    return max(cats, key=cats.get)


# Per-row results, collected in DataFrame order.
predicted_category = []
is_correct = []

# Only two columns are read per row, so iterate them directly.
for email_text, expected in zip(df["email_content"], df["category"]):
    doc = nlp(email_text)

    # Only the single best label is kept, so a multi-label email counts as
    # correct as soon as that one label is among the expected ones.
    predicted = _top_category(doc.cats)
    predicted_set = set(predicted.split(", ")) if predicted else set()
    actual_set = set(expected.split(", "))

    # An empty prediction is never correct; otherwise every predicted label
    # must be expected (the comparison is case-sensitive).
    is_all_present = bool(predicted_set) and predicted_set.issubset(actual_set)

    predicted_category.append(predicted)
    is_correct.append(is_all_present)

# Add the new columns to the DataFrame
df['predicted_category'] = predicted_category
df['is_correct'] = is_correct

# Count the number of correct predictions
correct_predictions = df['is_correct'].sum()

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / len(df)) * 100

print(f"The accuracy percentage of the predicted categories is {accuracy_percentage}%")

# Save DataFrame to CSV (optional)
df.to_csv('email_data_predicted.csv', index=False)

print(df)
	import pandas as pd
	from keys import OPENAI_API_KEY
	import os
	from spacy_llm.util import assemble
	import pandas as pd

	# Export the API key to the environment before the pipeline is assembled.
	os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

	# Assemble the pipeline from its config file.
	nlp = assemble("email_config.cfg")

	# Sample emails with their expected label(s); several labels are joined by ", ".
	data = {
	    'date': [
	        '2021-01-01', '2021-02-15', '2021-03-20', '2021-04-18',
	        '2021-05-05', '2021-06-25', '2021-07-14', '2021-08-09',
	        '2021-09-30', '2021-10-12',
	    ],
	    'email_content': [
	        'I am not happy with the service provided.',
	        'Please process my reimbursement as soon as possible.',
	        'How can I get coverage for my upcoming hospitalization?',
	        'What dental procedures are covered?',
	        'Is eye care covered in our plan?',
	        'What is the policy for maternity leave?',
	        'What coverage do I have for dental and vision?',
	        'I would like to file a complaint and get reimbursed for my expenses.',
	        'I am getting hospitalized and also have questions about maternity policy.',
	        'I am not satisfied with the dental service provided.',
	    ],
	    'category': [
	        'complaint', 'reimbursement', 'hospitalization', 'dental',
	        'vision', 'maternity', 'dental, vision', 'complaint, reimbursement',
	        'hospitalization, maternity', 'complaint, dental',
	    ],
	}

	df = pd.DataFrame(data)


	def _top_category(cats):
	    """Return the highest-scoring label in ``cats``.

	    Args:
	        cats: Mapping of label -> score, as read from ``doc.cats``.

	    Returns:
	        The label with the largest score (the first one on ties), or an
	        empty string when ``cats`` is empty, so that a document without any
	        category is scored as a miss instead of making ``max()`` raise
	        ``ValueError``.
	    """
	    if not cats:
	        return ""
	    return max(cats, key=cats.get)


	# Per-row results, collected in DataFrame order.
	predicted_category = []
	is_correct = []

	# Only two columns are read per row, so iterate them directly.
	for email_text, expected in zip(df["email_content"], df["category"]):
	    doc = nlp(email_text)

	    # Only the single best label is kept, so a multi-label email counts as
	    # correct as soon as that one label is among the expected ones.
	    predicted = _top_category(doc.cats)
	    predicted_set = set(predicted.split(", ")) if predicted else set()
	    actual_set = set(expected.split(", "))

	    # An empty prediction is never correct; otherwise every predicted label
	    # must be expected (the comparison is case-sensitive).
	    is_all_present = bool(predicted_set) and predicted_set.issubset(actual_set)

	    predicted_category.append(predicted)
	    is_correct.append(is_all_present)

	# Add the new columns to the DataFrame
	df['predicted_category'] = predicted_category
	df['is_correct'] = is_correct

	# Count the number of correct predictions
	correct_predictions = df['is_correct'].sum()

	# Calculate the accuracy percentage
	accuracy_percentage = (correct_predictions / len(df)) * 100

	print(f"The accuracy percentage of the predicted categories is {accuracy_percentage}%")

	# Save DataFrame to CSV (optional)
	df.to_csv('email_data_predicted.csv', index=False)

	print(df)