Skip to content

Instantly share code, notes, and snippets.

@pemagrg1
Last active September 4, 2019 11:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pemagrg1/f5bf72480e92ead779f6fa64203608bd to your computer and use it in GitHub Desktop.
Save pemagrg1/f5bf72480e92ead779f6fa64203608bd to your computer and use it in GitHub Desktop.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
from sklearn import linear_model
# Root folder of the project; the model/vectorizer pickle paths are built from it.
Project_path = "<path to the project folder>"

# Load the dataset; the script reads its "text" and "category" columns.
dataset_url = 'https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv'
data = pd.read_csv(dataset_url)

# Show the distinct class labels present in the dataset.
print(data["category"].unique())

# TF-IDF featurizer with sublinear (1 + log(tf)) term-frequency scaling.
# Undecodable bytes in byte inputs are dropped rather than raising.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    decode_error='ignore',
    sublinear_tf=True,
)
def _report_holdout_accuracy(X, y, test_size=0.2, random_state=42):
    """Print the accuracy of a throwaway LinearSVC on a held-out split.

    The features passed in were produced by a vectorizer fitted on the full
    dataset, so this figure is a quick sanity check rather than a strictly
    leak-free estimate.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    probe = svm.LinearSVC()
    probe.fit(X_train, y_train)
    print("holdout accuracy:", accuracy_score(y_test, probe.predict(X_test)))


def train_bpsd(df, vectorizer):
    """Fit the vectorizer and a LinearSVC classifier on ``df``.

    Args:
        df: DataFrame with a ``"text"`` column (documents) and a
            ``"category"`` column (labels).
        vectorizer: An unfitted vectorizer exposing ``fit_transform``;
            it is fitted in place on the ``"text"`` column.

    Returns:
        A ``(model, vectorizer)`` tuple: the LinearSVC trained on all rows
        and the same vectorizer object, now fitted.
    """
    # Cast to unicode so non-string cells (e.g. NaN) do not break tokenizing.
    texts = df["text"].values.astype('U')

    # One fit_transform both learns the vocabulary and builds the matrix;
    # a separate fit() beforehand would only repeat the same work.
    X = vectorizer.fit_transform(texts)
    y = df['category']

    _report_holdout_accuracy(X, y)

    model = svm.LinearSVC()
    print("==fitting the model===")
    # LinearSVC accepts the sparse TF-IDF matrix directly; densifying it
    # would only cost memory.
    model.fit(X, y)
    print("==fit done\\=")
    return model, vectorizer
# Output locations for the trained artifacts, relative to the project root.
model_path = Project_path + "/08. Multi-class_text_classification/models/model.pickle"
vectorizer_path = Project_path + "/08. Multi-class_text_classification/models/vectorizer.pickle"

# Train on the full dataset; returns the classifier and the fitted vectorizer.
model, vectorizer_model = train_bpsd(data, vectorizer)

# Use context managers so both files are flushed and closed even if
# pickling raises part-way through.
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)
with open(vectorizer_path, "wb") as vectorizer_file:
    pickle.dump(vectorizer_model, vectorizer_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment