ken333135

## NaiveBayes Multi-Label Classifier
# Two-stage pipeline: TF-IDF text features feeding a Multinomial Naive Bayes
# model wrapped one-vs-rest for multi-label classification.
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(
        estimator=MultinomialNB(fit_prior=True, class_prior=None),
    )),
])

#Train the model and get the prediction
# `score` starts empty; presumably one entry per label is appended in the loop — TODO confirm.
score = []
for code in label:
    # train the model
    # NOTE(review): the loop body is missing from this excerpt, so the file does not
    # parse as-is — the training/prediction statements need to be restored here.

## GradientBoostingClassifier
# Hyper-parameters for the gradient-boosting model, followed by a pipeline that
# feeds TF-IDF features into a one-vs-rest wrapper around that model.
params = dict(
    n_estimators=1200,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.01,
    min_samples_leaf=1,
    random_state=3,
)
GBC_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(
        estimator=ensemble.GradientBoostingClassifier(**params),
        n_jobs=1,
    )),
])

## TopWords 1
# Create the TF-IDF vectorizer.
vect = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the corpus. Only the fitted state is
# needed at this point, so call fit() instead of fit_transform(), whose
# document-term matrix was being computed and immediately discarded.
vect.fit(data_df['Your_freetext_column'])

# Inspect the fitted vocabulary: its size, then the word -> column-index mapping.
# (Bare expressions like these only display in an interactive session/notebook.)
len(vect.vocabulary_)
vect.vocabulary_

## TopWords 2
# TF-IDF score of every document in the corpus as a DataFrame: one row per
# document, one column per vocabulary index. This uses the fitted `vect` and the
# same `data_df` the labels are read from (instead of the undefined
# `vect_uncommon` / `uncommon_df`), so the rows line up with the labels and the
# column positions match `vect.vocabulary_`.
vect_data = pd.DataFrame(vect.transform(data_df['Your_freetext_column']).toarray())
# Append the label of each document as an extra column.
vect_data['Fault Code'] = list(data_df['Your_Label'])

# Inspect the result (only displayed in an interactive session/notebook).
vect_data.shape
vect_data

## TopWords 3
def find_top_words(code, n, data=None, vocabulary=None):
    """Return the ``n`` highest-scoring words for one row of the score table.

    The first parameter used to be spelled ``class``, which is a reserved word
    and made the definition a syntax error; the body already referred to it
    as ``code``, so that name is used throughout.

    Args:
        code: Index label of the row to inspect (looked up with ``.loc``).
        n: Number of words to return. Values larger than the row length are
            clamped; ``n <= 0`` yields an empty list.
        data: Score table indexed by label whose column positions match the
            vocabulary indices. Defaults to the module-level ``vect_data``.
            Assumes the selected row holds only numeric scores.
        vocabulary: Mapping of word -> column index. Defaults to
            ``vect.vocabulary_``.

    Returns:
        A list of ``[word, score]`` pairs for the ``n`` largest scores, in no
        particular order.

    Raises:
        KeyError: If ``code`` is not in the table index, or a selected column
            position has no word in ``vocabulary``.
    """
    if data is None:
        data = vect_data
    if vocabulary is None:
        vocabulary = vect.vocabulary_

    scores = list(data.loc[code])
    n = min(n, len(scores))
    if n <= 0:
        # argpartition(...)[-0:] would select every column rather than none.
        return []

    # Positions of the n largest scores (argpartition leaves them unordered).
    top_idx = np.argpartition(scores, -n)[-n:]
    # Invert the vocabulary once instead of scanning it for every position.
    word_at = {int(col): word for word, col in vocabulary.items()}
    return [[word_at[int(i)], scores[int(i)]] for i in top_idx]

#test the function on class 'ACON'. Grab the top 6 words
# (bare call: the result is only displayed in an interactive session/notebook)
find_top_words('ACON',6)

## TopWords 4
# Collects one [label, top-5 words] entry per index label of vect_data.
top_words = []
#loop to find top 5 words of each class in the dataset
for code in vect_data.index:
    top_words.append([code,find_top_words(code,5)])

#print the list of top words
# (bare expression, not a print() call — shows output only in a notebook/REPL)
top_words

## DrawGraphDash 1
# Assemble the network figure from the edge and node traces; the title embeds num_nodes.
# NOTE(review): this excerpt stops inside the first annotation dict — the closing
# brackets for dict(...), annotations=[...], go.Layout(...) and go.Figure(...) are
# missing, so the statement does not parse as-is.
# NOTE(review): `titlefont` may be deprecated in newer plotly releases in favour of
# title=dict(font=...) — confirm against the installed version.
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Network Graph of '+str(num_nodes)+' rules',
                titlefont=dict(size=16),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    showarrow=False,
                    xref="paper", yref="paper",

## DrawGraphDash 2
# Page layout: the graph on top, then a row with an 'Overall Data' column (node and
# edge counts of G) and a 'Selected Data' column containing the 'selected-data' div.
# NOTE(review): this excerpt stops inside the second column — the closing brackets of
# the inner html.Div([...]), the row's children=[...] and the outer html.Div([...])
# are missing, so the statement does not parse as-is.
# presumably a Dash callback fills 'selected-data'; none is visible here — TODO confirm.
app.layout = html.Div([
                html.Div(dcc.Graph(id='Graph',figure=fig)),
                html.Div(className='row', children=[
                    html.Div([html.H2('Overall Data'),
                              html.P('Num of nodes: ' + str(len(G.nodes))),
                              html.P('Num of edges: ' + str(len(G.edges)))],
                              className='three columns'),
                    html.Div([
                            html.H2('Selected Data'),
                            html.Div(id='selected-data'),

## genSankey
# Visible part of genSankey: gathers the distinct values of every column named in
# cat_cols into labelList and records, per column, how many distinct values it
# contributed (colorNumList).
# NOTE(review): the excerpt ends after this loop — value_cols, title and colorPalette
# are not used yet and nothing is returned, although the caller assigns the result to `fig`.
# NOTE(review): cat_cols=[] is a mutable default; it is not mutated in the visible
# code, but a None default would be the safer convention.
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # colour palette; NOTE(review): it lists 5 colours although this comment originally read
    # "maximum of 6 value cols -> 6 colors"
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        # set() removes duplicates but has no defined order, so the label order
        # within a column may differ between runs (e.g. for string values)
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp


## Create a Sankey Diagram
import pandas as pd
import plotly
import plotly.plotly as py

# Build the Sankey figure from the category columns lvl1..lvl4 with 'count' as the
# value column, then hand it to plotly's offline plot function (validate=False is passed through).
# NOTE(review): `py` (plotly.plotly) is imported above but not used in the visible code;
# that module is reported to have moved to chart_studio in plotly 4 — confirm before relying on it.
fig = genSankey(df,cat_cols=['lvl1','lvl2','lvl3','lvl4'],value_cols='count',title='Word Etymology')
plotly.offline.plot(fig, validate=False)
	# Two-stage pipeline: TF-IDF text features feeding a Multinomial Naive Bayes
	# model wrapped one-vs-rest for multi-label classification.
	NB_pipeline = Pipeline(steps=[
	    ('tfidf', TfidfVectorizer()),
	    ('clf', OneVsRestClassifier(
	        estimator=MultinomialNB(fit_prior=True, class_prior=None),
	    )),
	])

	#Train the model and get the prediction
	# `score` starts empty; presumably one entry per label is appended in the loop — TODO confirm.
	score = []
	for code in label:
	# train the model
	# NOTE(review): the loop body is missing from this excerpt and the block indentation
	# of this copy has been lost, so it does not parse as-is.
	# Hyper-parameters for the gradient-boosting model, followed by a pipeline that
	# feeds TF-IDF features into a one-vs-rest wrapper around that model.
	params = dict(
	    n_estimators=1200,
	    max_depth=3,
	    subsample=0.5,
	    learning_rate=0.01,
	    min_samples_leaf=1,
	    random_state=3,
	)
	GBC_pipeline = Pipeline(steps=[
	    ('tfidf', TfidfVectorizer()),
	    ('clf', OneVsRestClassifier(
	        estimator=ensemble.GradientBoostingClassifier(**params),
	        n_jobs=1,
	    )),
	])
	# Create the TF-IDF vectorizer.
	vect = TfidfVectorizer()
	# Learn the vocabulary and IDF weights from the corpus. Only the fitted state is
	# needed at this point, so call fit() instead of fit_transform(), whose
	# document-term matrix was being computed and immediately discarded.
	vect.fit(data_df['Your_freetext_column'])

	# Inspect the fitted vocabulary: its size, then the word -> column-index mapping.
	# (Bare expressions like these only display in an interactive session/notebook.)
	len(vect.vocabulary_)
	vect.vocabulary_
	# TF-IDF score of every document in the corpus as a DataFrame: one row per
	# document, one column per vocabulary index. This uses the fitted `vect` and the
	# same `data_df` the labels are read from (instead of the undefined
	# `vect_uncommon` / `uncommon_df`), so the rows line up with the labels and the
	# column positions match `vect.vocabulary_`.
	vect_data = pd.DataFrame(vect.transform(data_df['Your_freetext_column']).toarray())
	# Append the label of each document as an extra column.
	vect_data['Fault Code'] = list(data_df['Your_Label'])

	# Inspect the result (only displayed in an interactive session/notebook).
	vect_data.shape
	vect_data
	def find_top_words(code, n, data=None, vocabulary=None):
	    """Return the ``n`` highest-scoring words for one row of the score table.

	    The first parameter used to be spelled ``class``, which is a reserved word
	    and made the definition a syntax error; the body already referred to it
	    as ``code``, so that name is used throughout.

	    Args:
	        code: Index label of the row to inspect (looked up with ``.loc``).
	        n: Number of words to return. Values larger than the row length are
	            clamped; ``n <= 0`` yields an empty list.
	        data: Score table indexed by label whose column positions match the
	            vocabulary indices. Defaults to the module-level ``vect_data``.
	            Assumes the selected row holds only numeric scores.
	        vocabulary: Mapping of word -> column index. Defaults to
	            ``vect.vocabulary_``.

	    Returns:
	        A list of ``[word, score]`` pairs for the ``n`` largest scores, in no
	        particular order.

	    Raises:
	        KeyError: If ``code`` is not in the table index, or a selected column
	            position has no word in ``vocabulary``.
	    """
	    if data is None:
	        data = vect_data
	    if vocabulary is None:
	        vocabulary = vect.vocabulary_

	    scores = list(data.loc[code])
	    n = min(n, len(scores))
	    if n <= 0:
	        # argpartition(...)[-0:] would select every column rather than none.
	        return []

	    # Positions of the n largest scores (argpartition leaves them unordered).
	    top_idx = np.argpartition(scores, -n)[-n:]
	    # Invert the vocabulary once instead of scanning it for every position.
	    word_at = {int(col): word for word, col in vocabulary.items()}
	    return [[word_at[int(i)], scores[int(i)]] for i in top_idx]

	#test the function on class 'ACON'. Grab the top 6 words
	# (bare call: the result is only displayed in an interactive session/notebook)
	find_top_words('ACON',6)
	# Collects one [label, top-5 words] entry per index label of vect_data.
	top_words = []
	#loop to find top 5 words of each class in the dataset
	# NOTE(review): the loop body below has lost its indentation in this copy.
	for code in vect_data.index:
	top_words.append([code,find_top_words(code,5)])

	#print the list of top words
	# (bare expression, not a print() call — shows output only in a notebook/REPL)
	top_words
	# Assemble the network figure from the edge and node traces; the title embeds num_nodes.
	# NOTE(review): this excerpt stops inside the first annotation dict — the closing
	# brackets for dict(...), annotations=[...], go.Layout(...) and go.Figure(...) are
	# missing, so the statement does not parse as-is.
	# NOTE(review): `titlefont` may be deprecated in newer plotly releases in favour of
	# title=dict(font=...) — confirm against the installed version.
	fig = go.Figure(data=[edge_trace, node_trace],
	layout=go.Layout(
	title='<br>Network Graph of '+str(num_nodes)+' rules',
	titlefont=dict(size=16),
	showlegend=False,
	hovermode='closest',
	margin=dict(b=20,l=5,r=5,t=40),
	annotations=[ dict(
	showarrow=False,
	xref="paper", yref="paper",
	# Page layout: the graph on top, then a row with an 'Overall Data' column (node and
	# edge counts of G) and a 'Selected Data' column containing the 'selected-data' div.
	# NOTE(review): this excerpt stops inside the second column — the closing brackets of
	# the inner html.Div([...]), the row's children=[...] and the outer html.Div([...])
	# are missing, so the statement does not parse as-is.
	app.layout = html.Div([
	html.Div(dcc.Graph(id='Graph',figure=fig)),
	html.Div(className='row', children=[
	html.Div([html.H2('Overall Data'),
	html.P('Num of nodes: ' + str(len(G.nodes))),
	html.P('Num of edges: ' + str(len(G.edges)))],
	className='three columns'),
	html.Div([
	html.H2('Selected Data'),
	html.Div(id='selected-data'),
	# Visible part of genSankey: gathers the distinct values of every column named in
	# cat_cols into labelList and records, per column, how many distinct values it
	# contributed (colorNumList).
	# NOTE(review): this copy has lost its block indentation and ends after the loop —
	# value_cols, title and colorPalette are not used yet and nothing is returned.
	# NOTE(review): cat_cols=[] is a mutable default; it is not mutated in the visible
	# code, but a None default would be the safer convention.
	def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
	# colour palette; NOTE(review): it lists 5 colours although this comment originally read
	# "maximum of 6 value cols -> 6 colors"
	colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
	labelList = []
	colorNumList = []
	for catCol in cat_cols:
	# set() removes duplicates but has no defined order, so the label order
	# within a column may differ between runs (e.g. for string values)
	labelListTemp = list(set(df[catCol].values))
	colorNumList.append(len(labelListTemp))
	labelList = labelList + labelListTemp
	import pandas as pd
	import plotly
	import plotly.plotly as py

	# Build the Sankey figure from the category columns lvl1..lvl4 with 'count' as the
	# value column, then hand it to plotly's offline plot function (validate=False is passed through).
	# NOTE(review): `py` (plotly.plotly) is imported above but not used in the visible code;
	# that module is reported to have moved to chart_studio in plotly 4 — confirm before relying on it.
	fig = genSankey(df,cat_cols=['lvl1','lvl2','lvl3','lvl4'],value_cols='count',title='Word Etymology')
	plotly.offline.plot(fig, validate=False)