Skip to content

Instantly share code, notes, and snippets.

@oneryalcin
Last active August 6, 2019 11:27
Show Gist options
  • Save oneryalcin/c9c4fd75ad0b169e50a70f7e00fe05e3 to your computer and use it in GitHub Desktop.
Save oneryalcin/c9c4fd75ad0b169e50a70f7e00fe05e3 to your computer and use it in GitHub Desktop.
Example NLTK (Corporate messaging - Udacity example)
import nltk
nltk.download(['punkt', 'wordnet'])
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Pattern matching http/https URLs. Written as a raw string so that the
# backslash escapes (\( and \)) reach the regex engine verbatim instead of
# being treated as (invalid) Python string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
def load_data(
    data="https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/Corporate-messaging-DFE.csv",
):
    """Load the corporate-messaging dataset and return texts and labels.

    Data comes from figure8's collection
    (https://www.figure-eight.com/data-for-everyone/), CORPORATE MESSAGING.

    Args:
        data: Anything ``pandas.read_csv`` accepts (URL, path or file-like
            object). Defaults to the hosted corporate-messaging CSV. The
            file must provide the columns ``text``, ``category`` and
            ``category:confidence``.

    Returns:
        A tuple ``(X, y)`` of arrays: the message texts and their category
        labels, restricted to rows whose ``category:confidence`` equals 1
        and whose category is not ``'Exclude'``.
    """
    df = pd.read_csv(data, encoding='latin-1')

    # Keep only rows labelled with full confidence and a usable category.
    fully_confident = df["category:confidence"] == 1
    not_excluded = df['category'] != 'Exclude'
    df = df[fully_confident & not_excluded]

    X = df.text.values
    y = df.category.values
    return X, y
def tokenize(text):
    """Split a message into lemmatized, lower-cased, stripped tokens.

    Every URL matched by the module-level ``url_regex`` is first replaced by
    the literal token ``urlplaceholder``; the text is then tokenized with
    ``word_tokenize`` and each token is passed through ``WordNetLemmatizer``.

    Args:
        text: The raw message string.

    Returns:
        A list of cleaned token strings, in order of appearance.
    """
    # Substitute each match in place. Replacing detected URLs one by one with
    # str.replace would mangle a longer URL whenever a shorter detected URL
    # is its prefix (leaving e.g. "urlplaceholder/rest" behind).
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token).lower().strip()
        for token in word_tokenize(text)
    ]
def display_results(y_test, y_pred):
    """Print the labels, confusion matrix and accuracy of a prediction run.

    Args:
        y_test: True class labels (array-like).
        y_pred: Predicted class labels (array-like, same length as y_test).
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Use every label seen in either the truth or the predictions; taking
    # labels from y_pred alone would silently drop classes the model never
    # predicted from the confusion matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
def main():
    """Train a random-forest text classifier and report test-set results.

    Loads the data, splits it into train and test sets, turns the messages
    into TF-IDF weighted bag-of-words features, fits the classifier on the
    training split and prints the evaluation for the test split.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    vectorizer = CountVectorizer(tokenizer=tokenize)
    tfidf_transformer = TfidfTransformer()
    classifier = RandomForestClassifier()

    # Fit the vectorizer and TF-IDF weighting on the training split only,
    # then train the classifier on the resulting features.
    train_features = tfidf_transformer.fit_transform(
        vectorizer.fit_transform(X_train)
    )
    classifier.fit(train_features, y_train)

    # Reuse the already fitted transformers for the test split.
    test_features = tfidf_transformer.transform(vectorizer.transform(X_test))
    predictions = classifier.predict(test_features)

    display_results(y_test, predictions)


main()
@oneryalcin
Copy link
Author

Example result:

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
[[ 87   0  27]
 [  1  34   5]
 [  7   1 439]]
Accuracy: 0.931780366057

@oneryalcin
Copy link
Author

oneryalcin commented Aug 6, 2019

Note that the main() function can be simplified further by using sklearn's Pipeline class:

from sklearn.pipeline import Pipeline

def main():
    """Train and evaluate the classifier using a single sklearn Pipeline.

    The pipeline chains count vectorization, TF-IDF weighting and a random
    forest, so fitting and predicting each take one call on the raw text.
    """
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Steps run in order; each name identifies its stage inside the pipeline.
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier()),
    ]
    model = Pipeline(steps)

    # Fit on the training split, then score the held-out messages.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    display_results(y_test, predictions)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment