Last active
January 30, 2023 17:19
-
-
Save wjkennedy/b3d13ae777ffeb5444070d84044ef0ee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train one linear SVM per Jira field (component, custom_field_1,
# custom_field_2) from the TF-IDF representation of each issue's text, then
# report accuracy on a held-out tail of the CSV.
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC

TEXT_COLUMNS = ['summary', 'description', 'comment']
LABEL_COLUMNS = ['component', 'custom_field_1', 'custom_field_2']
# Number of leading rows used for training when the file is large enough.
TRAIN_ROWS = 800

# Load data into a pandas DataFrame
df = pd.read_csv('jira_data.csv')

n_rows = len(df)
if n_rows < 2:
    raise ValueError(
        'jira_data.csv must contain at least two rows to build a train/test split'
    )

# Encode each categorical target with its own encoder so that every mapping
# stays available for inverse_transform after training (a single shared
# encoder would only remember the last column it was fitted on).
encoders = {}
for column in LABEL_COLUMNS:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Build one document per issue. Missing cells become empty strings (a NaN in
# any column would otherwise turn the whole document into NaN) and the parts
# are separated by a space so words at the boundaries do not fuse together.
text_parts = df[TEXT_COLUMNS].fillna('').astype(str)
df['text'] = text_parts['summary'].str.cat(
    [text_parts['description'], text_parts['comment']], sep=' '
)

# Ordered split: the first TRAIN_ROWS rows train the model and the rest test
# it. For files with TRAIN_ROWS rows or fewer, fall back to an 80/20 split so
# the test set is never empty.
split_at = TRAIN_ROWS if n_rows > TRAIN_ROWS else int(n_rows * 0.8)
train_text = df['text'].iloc[:split_at]
test_text = df['text'].iloc[split_at:]
train_labels = df[LABEL_COLUMNS].iloc[:split_at, :]
test_labels = df[LABEL_COLUMNS].iloc[split_at:, :]

# Fit TF-IDF on the training documents only; the test documents are merely
# transformed, so no vocabulary or IDF statistics leak from the test set.
vectorizer = TfidfVectorizer()
train_data = vectorizer.fit_transform(train_text)
test_data = vectorizer.transform(test_text)

# Train a multi-output classifier: one independent LinearSVC per target column
clf = MultiOutputClassifier(LinearSVC())
clf.fit(train_data, train_labels)

# Evaluate the classifier on the test set.
# NOTE(review): per the scikit-learn docs, MultiOutputClassifier.score counts a
# row as correct only when every target is predicted correctly - confirm this
# is the metric you want.
accuracy = clf.score(test_data, test_labels)
print('Accuracy:', accuracy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.multioutput import MultiOutputClassifier | |
# Create X and y variables | |
X = df[['Summary', 'Description', 'Comments']] | |
y = df[['Project Key']] | |
# Split the data into training and test sets | |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict the Jira project key from issue text with a bag-of-words random forest.

# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is not created in this snippet; it is assumed to be an
# already-loaded DataFrame with 'Summary', 'Description', 'Comments' and
# 'Project Key' columns - confirm against the loading code.

# Create X and y variables: one text document per issue. Missing cells become
# empty strings and the columns are joined with a space so that CountVectorizer
# receives exactly one string per row (it cannot consume a list of lists, and
# flattening the frame would yield three documents per label).
text_columns = df[['Summary', 'Description', 'Comments']].fillna('').astype(str)
X = text_columns['Summary'].str.cat(
    [text_columns['Description'], text_columns['Comments']], sep=' '
)
y = df['Project Key']

# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out a test set; scoring on the training rows would overstate accuracy.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Convert text data to numerical using CountVectorizer. The vocabulary is
# learned from the training documents only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Create classifier (seeded so repeated runs give the same score)
clf = RandomForestClassifier(random_state=0)

# Fit classifier to training data
clf.fit(X_train, y_train)

# Predict on X test
y_pred = clf.predict(X_test)

# Print metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict the Jira project key from issue text and report held-out accuracy.

# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is not created in this snippet; it is assumed to be an
# already-loaded DataFrame with the columns used below - confirm.

# Create X and y variables
X = df[['Summary', 'Description', 'Comments']]
y = df['Project Key']

# Build one document per issue. Plain `+` would glue the last word of one
# column to the first word of the next and would turn the whole document into
# NaN if any cell is missing, so fill blanks and join with a space instead.
text_columns = X.fillna('').astype(str)
documents = text_columns['Summary'].str.cat(
    [text_columns['Description'], text_columns['Comments']], sep=' '
)

# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Create training and test sets (split the raw text first so the vectorizer
# below never sees the test documents)
docs_train, docs_test, y_train, y_test = train_test_split(
    documents, y, test_size=0.3, random_state=0
)

# Convert text data to numerical using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Create classifier (seeded like the split so the reported score is repeatable)
clf = RandomForestClassifier(random_state=0)

# Fit classifier to training data
clf.fit(X_train, y_train)

# Predict on test data
y_pred = clf.predict(X_test)

# Print metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a TF-IDF + logistic-regression text classifier, print a per-class
# report on a held-out split, and persist the fitted pipeline to disk.
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# NOTE(review): `data` is not created in this snippet; it is assumed to be an
# already-loaded table with 'text' and 'labels' columns - confirm.

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['labels'], test_size=0.2)

# Create a pipeline with a TfidfVectorizer and a LogisticRegression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit the pipeline to the training data (the vectorizer is fitted on the
# training documents only because it lives inside the pipeline)
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Save the model to a file
joblib.dump(pipeline, 'model.pkl')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Predict the Jira project key from issue text with a bag-of-words random forest.
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# NOTE(review): `df` is not created in this snippet; it is assumed to be an
# already-loaded DataFrame with the columns used below - confirm.

# Create X and y variables. These must exist before any encoder or vectorizer
# touches them. X is one text document per issue: blanks are filled and the
# columns are joined with a space.
text_columns = df[['Summary', 'Description', 'Comments']].fillna('').astype(str)
X = text_columns['Summary'].str.cat(
    [text_columns['Description'], text_columns['Comments']], sep=' '
)
y = df['Project Key']

# Create an instance of LabelEncoder
le = LabelEncoder()

# Fit and transform the 'y' variable
y = le.fit_transform(y)

# Split the data into training and test sets
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Use countvectorizer on 'X' variable. The forest needs numeric features, so
# the text is vectorized before fitting; the vocabulary comes from the
# training documents only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Create classifier
clf = RandomForestClassifier()

# Fit classifier to training data
clf.fit(X_train, y_train)

# Make predictions on test data
y_pred = clf.predict(X_test)

# Evaluate accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment