Skip to content

Instantly share code, notes, and snippets.

# Text-classification pipeline: a TF-IDF feature extractor feeding a
# one-vs-rest multinomial Naive Bayes classifier for multi-label prediction.
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    (
        'clf',
        OneVsRestClassifier(
            MultinomialNB(fit_prior=True, class_prior=None),
        ),
    ),
])
#Train the model and get the prediction
score = []
for code in label:
# train the model
# Gradient-boosting variant of the text pipeline: TF-IDF features feeding a
# one-vs-rest GradientBoostingClassifier configured by `params`.
params = dict(
    n_estimators=1200,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.01,
    min_samples_leaf=1,
    random_state=3,
)
GBC_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    (
        'clf',
        OneVsRestClassifier(
            ensemble.GradientBoostingClassifier(**params),
            n_jobs=1,
        ),
    ),
])
@ken333135
ken333135 / TopWords 1
Created June 19, 2018 02:11
Get Top words for each class (multi-class text classification)
# Create an instance of the TfidfVectorizer
vect = TfidfVectorizer()
# Fit the vectorizer to the corpus to build its vocabulary. Only the fitted
# state is needed here, so call fit() rather than fit_transform(), whose
# document-term matrix was being computed and immediately discarded.
vect.fit(data_df['Your_freetext_column'])
# Show the vocabulary size and the word -> column-index mapping
len(vect.vocabulary_)
vect.vocabulary_
@ken333135
ken333135 / TopWords 2
Created June 19, 2018 02:18
Get Top words for each class (multi-class text classification)
# Get the TF-IDF score of every document in the corpus as a pandas DataFrame
# (one row per document, one column per vocabulary index). This uses the
# fitted `vect` vectorizer and the `data_df` corpus so that the rows line up
# with the label column appended below, which is also taken from `data_df`.
vect_data = pd.DataFrame(vect.transform(data_df['Your_freetext_column']).toarray())
# Append a new column containing the label of each document
vect_data['Fault Code'] = list(data_df['Your_Label'])
vect_data.shape
vect_data
@ken333135
ken333135 / TopWords 3
Last active June 19, 2018 02:35
Get Top words for each class (multi-class text classification)
def find_top_words(label, n):
    """Return the ``n`` highest-scoring words for one class.

    Reads two module-level objects: ``vect_data`` (a DataFrame whose row
    ``label`` holds one score per vocabulary index) and ``vect`` (a fitted
    vectorizer whose ``vocabulary_`` maps each word to its column index).

    The first parameter is named ``label`` because ``class`` is a reserved
    word in Python and cannot be used as a parameter name.

    Args:
        label: Index label of the row in ``vect_data`` to inspect.
        n: Number of words to return; capped at the number of scores.

    Returns:
        A list of ``[word, score]`` pairs for the ``n`` largest scores, in
        no particular order. Empty when ``n`` is not positive.
    """
    scores = list(vect_data.loc[label])
    n = min(n, len(scores))
    if n <= 0:
        # argpartition(scores, -0)[-0:] would select every column, not none.
        return []
    top_idx = np.argpartition(scores, -n)[-n:]
    # Invert the word -> index mapping once instead of scanning it per word.
    index_to_word = {idx: word for word, idx in vect.vocabulary_.items()}
    # Scores come from the requested row, not from a global loop variable.
    return [[index_to_word[i], scores[i]] for i in top_idx]
# Test the function on class 'ACON': grab its top 6 words
find_top_words('ACON',6)
@ken333135
ken333135 / TopWords 4
Created June 19, 2018 02:38
Get Top words for each class (multi-class text classification)
# Collect one [class, top-5-words] entry for every index label of vect_data.
top_words = []
for code in vect_data.index:
    top_words.append([code, find_top_words(code, 5)])

# Show the collected list of top words
top_words
fig = go.Figure(data=[edge_trace, node_trace],
layout=go.Layout(
title='<br>Network Graph of '+str(num_nodes)+' rules',
titlefont=dict(size=16),
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
showarrow=False,
xref="paper", yref="paper",
app.layout = html.Div([
html.Div(dcc.Graph(id='Graph',figure=fig)),
html.Div(className='row', children=[
html.Div([html.H2('Overall Data'),
html.P('Num of nodes: ' + str(len(G.nodes))),
html.P('Num of edges: ' + str(len(G.edges)))],
className='three columns'),
html.Div([
html.H2('Selected Data'),
html.Div(id='selected-data'),
@ken333135
ken333135 / genSankey
Created May 8, 2019 07:57
Wrapper Function to create Sankey Diagram from DataFrame
# Wrapper that starts building a Sankey diagram from the category columns of a DataFrame.
# NOTE(review): only the beginning of this function is visible here; the rest of the body is cut off.
# NOTE(review): cat_cols=[] is a mutable default argument; the visible lines only read it — confirm it is never mutated.
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
# maximum of 6 value cols -> 6 colors
# NOTE(review): the palette below has 5 entries, not 6 — confirm the intended limit.
colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
# labelList: distinct values gathered from every category column, in cat_cols order
labelList = []
# colorNumList: number of distinct values found in each category column
colorNumList = []
for catCol in cat_cols:
# set() de-duplicates the column's values; the resulting order is arbitrary
labelListTemp = list(set(df[catCol].values))
colorNumList.append(len(labelListTemp))
labelList = labelList + labelListTemp
@ken333135
ken333135 / Create a Sankey Diagram
Created May 8, 2019 08:08
Script to create a Sankey Diagram using genSankey
import pandas as pd
import plotly
import plotly.plotly as py
# Build the Sankey figure from the four level columns, using 'count' as the
# value column.
fig = genSankey(
    df,
    cat_cols=['lvl1', 'lvl2', 'lvl3', 'lvl4'],
    value_cols='count',
    title='Word Etymology',
)
# Render the figure offline, with plotly's figure validation turned off.
plotly.offline.plot(fig, validate=False)