# oneryalcin/corporate_messaging.py

## corporate_messaging.py
import nltk
nltk.download(['punkt', 'wordnet'])

import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Pattern matching http/https URLs so they can be swapped for a placeholder
# token before tokenization. Kept as a raw string so the regex escapes for
# the parentheses are not parsed as (invalid) string escape sequences.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


# Default location of figure8's "Corporate Messaging" dataset
# (https://www.figure-eight.com/data-for-everyone/).
CORPORATE_MESSAGING_URL = (
    "https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/"
    "Corporate-messaging-DFE.csv"
)


def load_data(data=CORPORATE_MESSAGING_URL):
    """Load the corporate messaging dataset and split it into text and labels.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        data: Any CSV source accepted by ``pandas.read_csv`` (URL, path or
            file-like object). Defaults to the hosted figure8 CSV, so calling
            ``load_data()`` with no arguments behaves as before.

    Returns:
        A ``(X, y)`` tuple of arrays: the ``text`` column and the
        ``category`` column of the retained rows.
    """
    df = pd.read_csv(data, encoding='latin-1')
    keep = (df["category:confidence"] == 1) & (df["category"] != "Exclude")
    df = df[keep]
    return df["text"].values, df["category"].values


def tokenize(text):
    """Turn a raw message into a list of normalized tokens.

    URLs matched by ``url_regex`` are replaced with the literal token
    ``urlplaceholder``; the text is then split with ``word_tokenize`` and
    every token is lemmatized, lower-cased and stripped of whitespace.

    Args:
        text: The message to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    # Substitute every URL match in one pass. Replacing each matched string
    # with ``str.replace`` would also rewrite the prefix of a longer URL that
    # begins with a shorter URL found elsewhere in the same text.
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]


def display_results(y_test, y_pred):
    """Print the labels, confusion matrix and accuracy for a set of predictions.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned element-wise with ``y_test``.
    """
    # Coerce to arrays so the element-wise comparison below also works when
    # plain Python sequences are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Index the matrix by every class seen in either array, so a class that
    # appears only among the true labels still gets its own row and column.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)


def main():
    """Train a random forest on the corporate messages and report test results.

    Loads the data, splits it into train and test sets, builds bag-of-words
    counts with the custom ``tokenize`` function, re-weights them with TF-IDF,
    fits the classifier and prints the evaluation via ``display_results``.
    """
    messages, categories = load_data()
    train_msgs, test_msgs, train_cats, test_cats = train_test_split(
        messages, categories
    )

    counter = CountVectorizer(tokenizer=tokenize)
    weigher = TfidfTransformer()
    forest = RandomForestClassifier()

    # Fit the feature extractors on the training split only, then the model.
    train_features = weigher.fit_transform(counter.fit_transform(train_msgs))
    forest.fit(train_features, train_cats)

    # Reuse the fitted extractors (transform only) for the held-out split.
    test_features = weigher.transform(counter.transform(test_msgs))
    predictions = forest.predict(test_features)

    display_results(test_cats, predictions)


# Run the full load/train/evaluate pipeline whenever this module is run or imported.
main()
	import nltk
	nltk.download(['punkt', 'wordnet'])

	import re
	import numpy as np
	import pandas as pd
	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	from sklearn.metrics import confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

	# Pattern matching http/https URLs so they can be swapped for a placeholder
	# token before tokenization. The alternatives are separated by plain "|"
	# (a backslash-escaped pipe would match a literal pipe character instead),
	# and the pattern is a raw string so the regex escapes stay intact.
	url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


	# Default location of figure8's "Corporate Messaging" dataset
	# (https://www.figure-eight.com/data-for-everyone/).
	CORPORATE_MESSAGING_URL = (
	    "https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/"
	    "Corporate-messaging-DFE.csv"
	)


	def load_data(data=CORPORATE_MESSAGING_URL):
	    """Load the corporate messaging dataset and split it into text and labels.

	    Only rows whose ``category:confidence`` equals 1 and whose ``category``
	    is not ``'Exclude'`` are kept.

	    Args:
	        data: Any CSV source accepted by ``pandas.read_csv`` (URL, path or
	            file-like object). Defaults to the hosted figure8 CSV, so
	            calling ``load_data()`` with no arguments behaves as before.

	    Returns:
	        A ``(X, y)`` tuple of arrays: the ``text`` column and the
	        ``category`` column of the retained rows.
	    """
	    df = pd.read_csv(data, encoding='latin-1')
	    keep = (df["category:confidence"] == 1) & (df["category"] != "Exclude")
	    df = df[keep]
	    return df["text"].values, df["category"].values


	def tokenize(text):
	    """Turn a raw message into a list of normalized tokens.

	    URLs matched by ``url_regex`` are replaced with the literal token
	    ``urlplaceholder``; the text is then split with ``word_tokenize`` and
	    every token is lemmatized, lower-cased and stripped of whitespace.

	    Args:
	        text: The message to tokenize.

	    Returns:
	        A list of cleaned token strings, in their original order.
	    """
	    # Substitute every URL match in one pass. Replacing each matched string
	    # with ``str.replace`` would also rewrite the prefix of a longer URL
	    # that begins with a shorter URL found elsewhere in the same text.
	    text = re.sub(url_regex, "urlplaceholder", text)

	    lemmatizer = WordNetLemmatizer()
	    return [
	        lemmatizer.lemmatize(tok).lower().strip()
	        for tok in word_tokenize(text)
	    ]


	def display_results(y_test, y_pred):
	    """Print the labels, confusion matrix and accuracy for a set of predictions.

	    Args:
	        y_test: True class labels.
	        y_pred: Predicted class labels, aligned element-wise with ``y_test``.
	    """
	    # Coerce to arrays so the element-wise comparison below also works
	    # when plain Python sequences are passed in.
	    y_test = np.asarray(y_test)
	    y_pred = np.asarray(y_pred)

	    # Index the matrix by every class seen in either array, so a class that
	    # appears only among the true labels still gets its own row and column.
	    labels = np.union1d(y_test, y_pred)
	    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
	    accuracy = (y_pred == y_test).mean()

	    print("Labels:", labels)
	    print("Confusion Matrix:\n", confusion_mat)
	    print("Accuracy:", accuracy)


	def main():
	    """Train a random forest on the corporate messages and report test results.

	    Loads the data, splits it into train and test sets, builds bag-of-words
	    counts with the custom ``tokenize`` function, re-weights them with
	    TF-IDF, fits the classifier and prints the evaluation via
	    ``display_results``.
	    """
	    X, y = load_data()
	    X_train, X_test, y_train, y_test = train_test_split(X, y)

	    vect = CountVectorizer(tokenizer=tokenize)
	    tfidf = TfidfTransformer()
	    clf = RandomForestClassifier()

	    # Fit the feature extractors on the training split only, then the model.
	    X_train_counts = vect.fit_transform(X_train)
	    X_train_tfidf = tfidf.fit_transform(X_train_counts)
	    clf.fit(X_train_tfidf, y_train)

	    # Reuse the fitted extractors (transform only) for the held-out split.
	    X_test_counts = vect.transform(X_test)
	    X_test_tfidf = tfidf.transform(X_test_counts)
	    y_pred = clf.predict(X_test_tfidf)

	    display_results(y_test, y_pred)


	# Run the full load/train/evaluate pipeline whenever this module is run or imported.
	main()