# vankeer/categorize_jira_csv.py

## categorize_jira_csv.py
import os
import pandas as pd
from openai import OpenAI

# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# CSV columns whose values are combined into the text submitted for categorization
columns_to_consider = [
    "Summary",
    "Description",
    "Epic Link Summary",
    "Parent summary",
]

# Component names offered to the model as the allowed answers
system_components = [
    "Web platform",
    "Mobile app",
    "API",
    "Open API",
    "Libs",
    "Post-processor",
    "Unknown",
]

def _match_component(raw_answer, components):
    """
    Maps a raw model answer onto one entry of `components`.

    Surrounding whitespace, quotes and punctuation are ignored and the
    comparison is case-insensitive, so an answer such as '"api".' still
    resolves to "API". Returns "Unknown" when the answer is empty or does not
    correspond to any component.
    """
    if not raw_answer:
        return "Unknown"
    cleaned = raw_answer.strip().strip("\"'`.,;:!").strip()
    by_folded_name = {name.casefold(): name for name in components}
    return by_folded_name.get(cleaned.casefold(), "Unknown")


def categorize_issue(issue_description):
    """
    Categorizes a JIRA issue using OpenAI's GPT-3.5.

    Args:
        issue_description: Text describing the issue.

    Returns:
        One entry of `system_components`. "Unknown" is returned when the
        description is empty or not a string, when the model's answer does
        not correspond to any component, or when the API call fails.
    """
    # Nothing to classify: skip the API round-trip entirely.
    if not isinstance(issue_description, str) or not issue_description.strip():
        return "Unknown"

    try:
        prompt = (
            issue_description
            + "\n\nCategorize the above JIRA issue into one of the following components. Answer with ONLY ONE item from this list, nothing else:\n"
            + ", ".join(system_components)
        )
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            # Classification should be repeatable, so disable sampling randomness.
            temperature=0,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        # Tolerate harmless formatting differences (case, quotes, trailing
        # period) instead of discarding an otherwise valid answer.
        return _match_component(response.choices[0].message.content, system_components)
    except Exception as e:
        # Deliberate best effort: one failed request must not abort the whole
        # run, so the failure is reported and the issue stays "Unknown".
        print(f"Error during categorization: {e}")
        return "Unknown"

def _build_issue_text(row, actual_columns):
    """
    Builds the text submitted for categorization from a single CSV row.

    `actual_columns` maps each configured column label to the matching
    DataFrame column. Missing (NaN) cells are skipped; every other cell is
    emitted as a "<label>: <value>" line, in the order of `actual_columns`.
    """
    return "\n".join(
        f"{label}: {row[column]}"
        for label, column in actual_columns.items()
        if pd.notna(row[column])
    )


def process_csv(input_file, output_file):
    """
    Processes the JIRA CSV file and categorizes each issue.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the categorized CSV is written to.

    Raises:
        ValueError: If a column listed in `columns_to_consider` is not
            present in the CSV (compared case-insensitively, ignoring
            surrounding spaces).

    Side effects:
        Prints the detected columns and a progress line per row, and writes
        `output_file` with an added 'Categorization' column. The headers in
        the output are the stripped, title-cased versions of the input
        headers.
    """
    # Read the CSV file
    df = pd.read_csv(input_file)

    # Display column headers for debugging
    print("Columns found in CSV:", df.columns.tolist())

    # Trim spaces from column headers and normalise their casing
    df.columns = [col.strip().title() for col in df.columns]

    # Index the headers by lower-cased name; the first occurrence wins when
    # several headers differ only by case.
    headers_by_lowercase = {}
    for header in df.columns:
        headers_by_lowercase.setdefault(header.lower(), header)

    # Resolve every configured column to the header actually present
    actual_columns = {}
    for col in columns_to_consider:
        key = col.lower()
        if key not in headers_by_lowercase:
            raise ValueError(f"Column '{col}' not found in the CSV file")
        actual_columns[col] = headers_by_lowercase[key]

    # Categorize each row; the progress counter is 1-based so the last line
    # reads "row N out of N".
    total_rows = len(df)
    categories = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"Processing row {position} out of {total_rows}...")
        categories.append(categorize_issue(_build_issue_text(row, actual_columns)))

    # Attach all results at once (also valid for a CSV without data rows)
    df['Categorization'] = pd.Series(categories, index=df.index, dtype=object)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False)

# Example usage
input_csv = 'Jira.csv'
output_csv = 'Jira_output.csv'

# Run the categorization only when this file is executed as a script, so that
# importing the module (e.g. to reuse process_csv) does not read files or
# call the OpenAI API as a side effect.
if __name__ == "__main__":
    process_csv(input_csv, output_csv)
	import os
	import pandas as pd
	from openai import OpenAI

	# Shared OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
	client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

	# CSV columns whose values are combined into the text submitted for categorization
	columns_to_consider = [
	    "Summary",
	    "Description",
	    "Epic Link Summary",
	    "Parent summary",
	]

	# Component names offered to the model as the allowed answers
	system_components = [
	    "Web platform",
	    "Mobile app",
	    "API",
	    "Open API",
	    "Libs",
	    "Post-processor",
	    "Unknown",
	]

	def _match_component(raw_answer, components):
	    """
	    Maps a raw model answer onto one entry of `components`.

	    Surrounding whitespace, quotes and punctuation are ignored and the
	    comparison is case-insensitive, so an answer such as '"api".' still
	    resolves to "API". Returns "Unknown" when the answer is empty or does
	    not correspond to any component.
	    """
	    if not raw_answer:
	        return "Unknown"
	    cleaned = raw_answer.strip().strip("\"'`.,;:!").strip()
	    by_folded_name = {name.casefold(): name for name in components}
	    return by_folded_name.get(cleaned.casefold(), "Unknown")


	def categorize_issue(issue_description):
	    """
	    Categorizes a JIRA issue using OpenAI's GPT-3.5.

	    Args:
	        issue_description: Text describing the issue.

	    Returns:
	        One entry of `system_components`. "Unknown" is returned when the
	        description is empty or not a string, when the model's answer does
	        not correspond to any component, or when the API call fails.
	    """
	    # Nothing to classify: skip the API round-trip entirely.
	    if not isinstance(issue_description, str) or not issue_description.strip():
	        return "Unknown"

	    try:
	        prompt = (
	            issue_description
	            + "\n\nCategorize the above JIRA issue into one of the following components. Answer with ONLY ONE item from this list, nothing else:\n"
	            + ", ".join(system_components)
	        )
	        response = client.chat.completions.create(
	            model="gpt-3.5-turbo",
	            # Classification should be repeatable, so disable sampling randomness.
	            temperature=0,
	            messages=[
	                {"role": "system", "content": "You are a helpful assistant."},
	                {"role": "user", "content": prompt},
	            ],
	        )
	        # Tolerate harmless formatting differences (case, quotes, trailing
	        # period) instead of discarding an otherwise valid answer.
	        return _match_component(response.choices[0].message.content, system_components)
	    except Exception as e:
	        # Deliberate best effort: one failed request must not abort the
	        # whole run, so the failure is reported and the issue stays "Unknown".
	        print(f"Error during categorization: {e}")
	        return "Unknown"

	def _build_issue_text(row, actual_columns):
	    """
	    Builds the text submitted for categorization from a single CSV row.

	    `actual_columns` maps each configured column label to the matching
	    DataFrame column. Missing (NaN) cells are skipped; every other cell is
	    emitted as a "<label>: <value>" line, in the order of `actual_columns`.
	    """
	    return "\n".join(
	        f"{label}: {row[column]}"
	        for label, column in actual_columns.items()
	        if pd.notna(row[column])
	    )


	def process_csv(input_file, output_file):
	    """
	    Processes the JIRA CSV file and categorizes each issue.

	    Args:
	        input_file: Path of the CSV file to read.
	        output_file: Path the categorized CSV is written to.

	    Raises:
	        ValueError: If a column listed in `columns_to_consider` is not
	            present in the CSV (compared case-insensitively, ignoring
	            surrounding spaces).

	    Side effects:
	        Prints the detected columns and a progress line per row, and writes
	        `output_file` with an added 'Categorization' column. The headers in
	        the output are the stripped, title-cased versions of the input
	        headers.
	    """
	    # Read the CSV file
	    df = pd.read_csv(input_file)

	    # Display column headers for debugging
	    print("Columns found in CSV:", df.columns.tolist())

	    # Trim spaces from column headers and normalise their casing
	    df.columns = [col.strip().title() for col in df.columns]

	    # Index the headers by lower-cased name; the first occurrence wins when
	    # several headers differ only by case.
	    headers_by_lowercase = {}
	    for header in df.columns:
	        headers_by_lowercase.setdefault(header.lower(), header)

	    # Resolve every configured column to the header actually present
	    actual_columns = {}
	    for col in columns_to_consider:
	        key = col.lower()
	        if key not in headers_by_lowercase:
	            raise ValueError(f"Column '{col}' not found in the CSV file")
	        actual_columns[col] = headers_by_lowercase[key]

	    # Categorize each row; the progress counter is 1-based so the last line
	    # reads "row N out of N".
	    total_rows = len(df)
	    categories = []
	    for position, (_, row) in enumerate(df.iterrows(), start=1):
	        print(f"Processing row {position} out of {total_rows}...")
	        categories.append(categorize_issue(_build_issue_text(row, actual_columns)))

	    # Attach all results at once (also valid for a CSV without data rows)
	    df['Categorization'] = pd.Series(categories, index=df.index, dtype=object)

	    # Save the updated DataFrame to a new CSV file
	    df.to_csv(output_file, index=False)

	# Example usage
	input_csv = 'Jira.csv'
	output_csv = 'Jira_output.csv'

	# Run the categorization only when this file is executed as a script, so
	# that importing the module (e.g. to reuse process_csv) does not read files
	# or call the OpenAI API as a side effect.
	if __name__ == "__main__":
	    process_csv(input_csv, output_csv)