Skip to content

Instantly share code, notes, and snippets.

import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Resolve the 'data' directory located next to this script (symlinks in the
# script path are resolved by realpath) and print it.
PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
print(PATH)
@wtberry
wtberry / NameClassifier_data_load.ipynb
Created June 23, 2019 21:53
medium/NameClassifier/dataload
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from sklearn.preprocessing import LabelEncoder
# Build a mapping from each unique label text in df['code'] to a unique integer.
# LabelEncoder accepts the 1-D label column directly, and the fitted encoder can
# be reused afterwards: transform() encodes labels, inverse_transform() decodes
# the integers back to the original label text.
encoder = LabelEncoder().fit(df['code'])
# Encode the label column of the entire dataset into integer class ids.
y = encoder.transform(df['code'])
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from df['name'] and, in the same call, turn those same
# documents into a sparse matrix of word counts. The fitted vectorizer is kept
# so that further text can be transformed with the same vocabulary.
vectorizer = CountVectorizer()
word_mat = vectorizer.fit_transform(df['name'])
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Hold out 30% of the samples for evaluation. The split is performed exactly
# once and before training, so x_test / y_test never contain rows the model
# was fitted on (re-splitting after fit() would reshuffle the data and leak
# training rows into the test set).
x_train, x_test, y_train, y_test = train_test_split(word_mat, y, test_size=0.3)
# Instantiate the model as clf (classifier) and train it on the training portion.
clf = MultinomialNB()
clf.fit(x_train, y_train)
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.