This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pyenv-based virtualenv configuration for the project.
# (Scrape artifacts "| |" removed — they made every assignment invalid.)
SHELL:=/bin/bash

PROJECT=project
VERSION=3.7.4

# Name of the pyenv virtualenv, e.g. "project-3.7.4".
VENV=${PROJECT}-${VERSION}
# Location pyenv-virtualenv creates the env under, and its interpreter.
VENV_DIR=$(shell pyenv root)/versions/${VENV}
PYTHON=${VENV_DIR}/bin/python

# Jupyter kernel name mirrors the venv; default notebook port.
JUPYTER_ENV_NAME=${VENV}
JUPYTER_PORT=8888
## Make sure you have `pyenv` and `pyenv-virtualenv` installed beforehand
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pipeline = Pipeline([ | |
# Use FeatureUnion to combine the features from subject and body | |
('union', FeatureUnion( | |
transformer_list=[ | |
('review_text', Pipeline([ | |
('selector', ItemSelector(key='review_text')), | |
('count_dict', CountVectorizer()), | |
])), | |
('rating', Pipeline([ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit a multinomial Naive Bayes model on the bag-of-words training matrix
# and report held-out accuracy as a percentage.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test, pred)

accuracy_pct = round(100 * score, 2)
print('Classifier accuracy: ' + str(accuracy_pct) + '%')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hold out a third of the reviews for testing, then prepare two text
# featurizers over the remaining training text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data['review_text'],
    data['sentiment'],
    test_size=0.33,
    random_state=1,
)

# Bag-of-words counts and TF-IDF weights; both drop English stop words,
# and TF-IDF additionally ignores terms appearing in >70% of documents.
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_sent(rating):
    """Map a numeric rating to a sentiment label.

    Ratings below 3 are 'negative', above 3 'positive', and exactly 3
    the neutral label 'mweh'.
    """
    if rating > 3:
        return 'positive'
    if rating < 3:
        return 'negative'
    return 'mweh'
# Derive the sentiment label column from the numeric rating column.
data['sentiment'] = data['rating'].apply(get_sent)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Flesch-Kincaid readability inputs: per-review word and sentence counts
# across our population of reviews.
from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen

# Hyphenation dictionary (presumably for syllable counts downstream — confirm).
dic = pyphen.Pyphen(lang='en')

# Insert a space between a word and adjacent punctuation so the tokenizer
# splits them. BUG FIX: the original classes contained `!-?`, which regex
# reads as a RANGE from '!' (0x21) to '?' (0x3F) — matching digits, '<',
# '=', '>' etc. and inflating word counts. The hyphen is escaped so only
# the listed punctuation characters match.
data['word_count'] = data.apply(
    lambda row: re.sub(r"(\w)([.,;:!\-?'\"”\)])", r"\1 \2", row['review_text']),
    axis=1)
data['word_count'] = data.apply(
    lambda row: re.sub(r"([.,;:!\-?'\"“\(])(\w)", r"\1 \2", row['word_count']),
    axis=1)
# Strip HTML tags before counting tokens.
data['word_count'] = data.apply(
    lambda row: re.sub(r"<[^>]*>", "", row['word_count']),
    axis=1)
data['word_count'] = data.apply(
    lambda row: len(word_tokenize(row['word_count'])),
    axis=1)
data['sent_count'] = data.apply(
    lambda row: len(sent_tokenize(row['review_text'])),
    axis=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text-preprocessing imports and the English stop-word set.
from __future__ import unicode_literals

import re
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.corpus.reader.wordnet import NOUN
from nltk.stem import WordNetLemmatizer

# Tokens in this set are filtered out during feature extraction.
stop = set(stopwords.words('english'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
featureDict = {} # A global dictionary of features | |
def toFeatureVector(tokens, rating, verified): | |
# Should return a dictionary containing features as keys, and weights as values | |
v = {} | |
for t in tokens: | |
try: | |
featureDict[t] += 1 | |
except KeyError: | |
featureDict[t] = 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse_label(label):
    """Decode a raw review label.

    The marker '__label2__' denotes a real review; any other value is
    treated as fake.
    """
    return 'real' if label == '__label2__' else 'fake'
def parse_verification(label): | |
if label == 'N': | |
return 0 | |
else: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
featureDict = {} # A global dictionary of features | |
def toFeatureVector(tokens): | |
# Should return a dictionary containing features as keys, and weights as values | |
v = {} | |
for t in tokens: | |
try: | |
featureDict[t] += 1 | |
except KeyError: | |
featureDict[t] = 1 |
NewerOlder