import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
# Load data into a pandas DataFrame
df = pd.read_csv('jira_data.csv')
# Encode each categorical target column with its own LabelEncoder so the
# original field values can be recovered later with inverse_transform
encoders = {}
for col in ['component', 'custom_field_1', 'custom_field_2']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# Concatenate text columns (fill missing values and separate with spaces)
df['text'] = (df['summary'].fillna('') + ' ' +
              df['description'].fillna('') + ' ' +
              df['comment'].fillna(''))
# Vectorize text data using Tf-idf
vectorizer = TfidfVectorizer()
text_vector = vectorizer.fit_transform(df['text'])
# Split data into training and testing sets
train_data = text_vector[:800, :]
train_labels = df[['component', 'custom_field_1', 'custom_field_2']].iloc[:800, :]
test_data = text_vector[800:, :]
test_labels = df[['component', 'custom_field_1', 'custom_field_2']].iloc[800:, :]
# Train a multi-label classifier using Linear SVM
clf = MultiOutputClassifier(LinearSVC())
clf.fit(train_data, train_labels)
# Evaluate the classifier on the test set
accuracy = clf.score(test_data, test_labels)
print('Accuracy:', accuracy)
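A minimal usage sketch for the model above (not part of the original gist): the fitted vectorizer, the per-column encoders, and the multi-output classifier are assumed to be the objects built in the previous block, and the example issue text is made up.
# Vectorize a new issue's text with the already-fitted TfidfVectorizer
new_text = ['Login page throws 500 error when SSO is enabled']
new_vector = vectorizer.transform(new_text)
# Predict encoded values for all three fields at once
pred = clf.predict(new_vector)[0]
# Map the integer predictions back to the original Jira field values
for value, col in zip(pred, ['component', 'custom_field_1', 'custom_field_2']):
    print(col, '->', encoders[col].inverse_transform([value])[0])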
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
# Create X and y variables
X = df[['Summary', 'Description', 'Comments']]
y = df[['Project Key']]
# Split the data into training and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Create X and y variables: combine the text columns into one string per issue
X = (df['Summary'].fillna('') + ' ' +
     df['Description'].fillna('') + ' ' +
     df['Comments'].fillna(''))
y = df['Project Key']
# Convert text data to numerical using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to data
clf.fit(X, y)
# Predict on the training data (no held-out set in this version)
y_pred = clf.predict(X)
# Print training accuracy
print("Training accuracy:", metrics.accuracy_score(y, y_pred))
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Create X and y variables
X = df[['Summary', 'Description', 'Comments']]
y = df['Project Key']
# Convert text data to numerical using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X['Summary'].fillna('') + ' ' + X['Description'].fillna('') + ' ' + X['Comments'].fillna(''))
# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to training data
clf.fit(X_train, y_train)
# Predict on test data
y_pred = clf.predict(X_test)
# Print metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Import the pieces used in this pipeline example
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Split the dataset into training and test sets ('data' is a DataFrame with 'text' and 'labels' columns)
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['labels'], test_size=0.2)
# Create a pipeline with a TfidfVectorizer and a LogisticRegression
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', LogisticRegression())
])
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
# Make predictions on the test set
y_pred = pipeline.predict(X_test)
# Print the classification report
print(classification_report(y_test, y_pred))
# Save the model to a file
import joblib
joblib.dump(pipeline, 'model.pkl')
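A minimal sketch of loading the saved pipeline and classifying a new issue; the file name matches the dump above and the issue text is made up.
import joblib
# Load the persisted pipeline and route a new issue
loaded = joblib.load('model.pkl')
new_issue = ['Checkout fails with a timeout when the cart has more than 50 items']
print(loaded.predict(new_issue))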
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
le = LabelEncoder()
# Fit and transform the 'y' variable
y = le.fit_transform(y)
# Use CountVectorizer on the 'X' variable (X is expected to be a single Series of text)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Create X and y variables: combine the text columns and vectorize them,
# since RandomForestClassifier cannot be fit on raw strings
from sklearn.feature_extraction.text import CountVectorizer
text = (df['Summary'].fillna('') + ' ' +
        df['Description'].fillna('') + ' ' +
        df['Comments'].fillna(''))
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)
y = df['Project Key']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to training data
clf.fit(X_train, y_train)
# Make predictions on test data
y_pred = clf.predict(X_test)
# Evaluate accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)