import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
# Load data into a pandas DataFrame
df = pd.read_csv('jira_data.csv')
# Encode each categorical target column with its own LabelEncoder so the
# original field values can be recovered later with inverse_transform
encoders = {}
for col in ['component', 'custom_field_1', 'custom_field_2']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# Concatenate text columns (fill missing values and separate with spaces)
df['text'] = (df['summary'].fillna('') + ' ' +
              df['description'].fillna('') + ' ' +
              df['comment'].fillna(''))
# Vectorize text data using Tf-idf
vectorizer = TfidfVectorizer()
text_vector = vectorizer.fit_transform(df['text'])
# Split data into training and testing sets
train_data = text_vector[:800, :]
train_labels = df[['component', 'custom_field_1', 'custom_field_2']].iloc[:800, :]
test_data = text_vector[800:, :]
test_labels = df[['component', 'custom_field_1', 'custom_field_2']].iloc[800:, :]
# Train a multi-label classifier using Linear SVM
clf = MultiOutputClassifier(LinearSVC())
clf.fit(train_data, train_labels)
# Evaluate the classifier on the test set
accuracy = clf.score(test_data, test_labels)
print('Accuracy:', accuracy)
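A minimal usage sketch for the model above (not part of the original gist): the fitted vectorizer, the per-column encoders, and the multi-output classifier are assumed to be the objects built in the previous block, and the example issue text is made up.
# Vectorize a new issue's text with the already-fitted TfidfVectorizer
new_text = ['Login page throws 500 error when SSO is enabled']
new_vector = vectorizer.transform(new_text)
# Predict encoded values for all three fields at once
pred = clf.predict(new_vector)[0]
# Map the integer predictions back to the original Jira field values
for value, col in zip(pred, ['component', 'custom_field_1', 'custom_field_2']):
    print(col, '->', encoders[col].inverse_transform([value])[0])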
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
# Create X and y variables
X = df[['Summary', 'Description', 'Comments']]
y = df[['Project Key']]
# Split the data into training and test sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Create X and y variables: combine the text columns into one string per issue
X = (df['Summary'].fillna('') + ' ' +
     df['Description'].fillna('') + ' ' +
     df['Comments'].fillna(''))
y = df['Project Key']
# Convert text data to numerical using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to data
clf.fit(X, y)
# Predict on the training data (no held-out set in this version)
y_pred = clf.predict(X)
# Print training accuracy
print("Training accuracy:", metrics.accuracy_score(y, y_pred))
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Create X and y variables
X = df[['Summary', 'Description', 'Comments']]
y = df['Project Key']
# Convert text data to numerical using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X['Summary'].fillna('') + ' ' + X['Description'].fillna('') + ' ' + X['Comments'].fillna(''))
# Convert labels to integers using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to training data
clf.fit(X_train, y_train)
# Predict on test data
y_pred = clf.predict(X_test)
# Print metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Import the pieces used in this pipeline example
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Split the dataset into training and test sets ('data' is a DataFrame with 'text' and 'labels' columns)
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['labels'], test_size=0.2)
# Create a pipeline with a TfidfVectorizer and a LogisticRegression
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', LogisticRegression())
])
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)
# Make predictions on the test set
y_pred = pipeline.predict(X_test)
# Print the classification report
print(classification_report(y_test, y_pred))
# Save the model to a file
import joblib
joblib.dump(pipeline, 'model.pkl')
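A minimal sketch of loading the saved pipeline and classifying a new issue; the file name matches the dump above and the issue text is made up.
import joblib
# Load the persisted pipeline and route a new issue
loaded = joblib.load('model.pkl')
new_issue = ['Checkout fails with a timeout when the cart has more than 50 items']
print(loaded.predict(new_issue))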
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder
le = LabelEncoder()
# Fit and transform the 'y' variable
y = le.fit_transform(y)
# Use CountVectorizer on the 'X' variable (X is expected to be a single Series of text)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Create X and y variables: combine the text columns and vectorize them,
# since RandomForestClassifier cannot be fit on raw strings
from sklearn.feature_extraction.text import CountVectorizer
text = (df['Summary'].fillna('') + ' ' +
        df['Description'].fillna('') + ' ' +
        df['Comments'].fillna(''))
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)
y = df['Project Key']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Create classifier
clf = RandomForestClassifier()
# Fit classifier to training data
clf.fit(X_train, y_train)
# Make predictions on test data
y_pred = clf.predict(X_test)
# Evaluate accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)