Skip to content

Instantly share code, notes, and snippets.

@ken333135
ken333135 / TopWords 3
Last active June 19, 2018 02:35
Get Top words for each class (multi-class text classification)
def find_top_words(code, n):
    """Return the top-n TF-IDF scoring words for the class labelled *code*.

    Parameters
    ----------
    code : hashable
        Row label of the class in the global ``vect_data`` DataFrame.
    n : int
        Number of top-scoring words to return.

    Returns
    -------
    list
        ``[word, score]`` pairs for the n highest TF-IDF scores
        (unordered within the top n, since argpartition does not sort).

    NOTE: relies on module-level globals ``vect_data`` (per-class TF-IDF
    scores) and ``vect`` (the fitted TfidfVectorizer).

    Fixes vs. original: the parameter was named ``class`` (a reserved
    keyword, so the def could not even compile) while the body referenced
    ``code`` — unified on ``code``.
    """
    scores = list(vect_data.loc[code])
    # Indices of the n largest scores; argpartition is O(V) vs O(V log V) for a full sort.
    ind = np.argpartition(scores, -n)[-n:]
    # Invert the word -> column-index vocabulary once, instead of doing a
    # linear scan of vocabulary_.values() for every selected index.
    index_to_word = {idx: word for word, idx in vect.vocabulary_.items()}
    top_words = []
    for index in ind:
        top_words.append([index_to_word[index], scores[index]])
    return top_words
# Smoke-test the function on the class labelled 'ACON': grab its top 6 words.
find_top_words('ACON',6)
@ken333135
ken333135 / TopWords 2
Created June 19, 2018 02:18
Get Top words for each class (multi-class text classification)
# Compute the TF-IDF score of every document in the corpus and load the
# resulting dense matrix into a pandas DataFrame (one row per document,
# one column per vocabulary term).
tfidf_rows = vect_uncommon.transform(uncommon_df['Fault Desc_Rect']).toarray()
vect_data = pd.DataFrame(list(tfidf_rows))
# Append a column holding the class label of each document.
vect_data['Fault Code'] = list(data_df['Your_Label'])
# Notebook-style inspection: shape, then the DataFrame itself.
vect_data.shape
vect_data
@ken333135
ken333135 / TopWords 1
Created June 19, 2018 02:11
Get Top words for each class (multi-class text classification)
# Build the TF-IDF vocabulary from the free-text corpus.
vect = TfidfVectorizer()
corpus = data_df['Your_freetext_column']
# Fitting learns the vocabulary; the transformed matrix returned here is
# not kept — only the fitted vectorizer state matters at this step.
vect.fit_transform(corpus)
# Inspect the vocabulary size and the word -> column-index mapping.
len(vect.vocabulary_)
vect.vocabulary_
# Hyper-parameters for the gradient boosting base estimator.
params = {
    'n_estimators': 1200,
    'max_depth': 3,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'min_samples_leaf': 1,
    'random_state': 3,
}
# Pipeline: TF-IDF text features feeding a one-vs-rest GradientBoostingClassifier.
GBC_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(
            ensemble.GradientBoostingClassifier(**params), n_jobs=1)),
    ]
)
# Pipeline combining the same text feature extractor with a one-vs-rest
# multinomial Naive Bayes multi-label classifier.
NB_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(
            MultinomialNB(fit_prior=True, class_prior=None))),
    ]
)
#Train the model and get the prediction
score = []
for code in label:
# train the model