Makefile — Python environment with pyenv-virtualenv (forked from genyrosk/Makefile, created July 18, 2023)
## Make sure you have `pyenv` and `pyenv-virtualenv` installed beforehand
SHELL:=/bin/bash
PROJECT=project
VERSION=3.7.4
VENV=${PROJECT}-${VERSION}
VENV_DIR=$(shell pyenv root)/versions/${VENV}
PYTHON=${VENV_DIR}/bin/python
JUPYTER_ENV_NAME=${VENV}
JUPYTER_PORT=8888
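The gist as captured stops at the variable block. A minimal sketch of targets these variables could drive (target names are assumptions; the pyenv-virtualenv and ipykernel commands themselves are standard):

# Sketch only: target names assumed, commands are standard pyenv/ipykernel usage
venv:
	pyenv virtualenv ${VERSION} ${VENV}
	pyenv local ${VENV}

jupyter:
	${PYTHON} -m pip install jupyter ipykernel
	${PYTHON} -m ipykernel install --user --name ${JUPYTER_ENV_NAME}
	jupyter notebook --port ${JUPYTER_PORT}

clean:
	pyenv virtualenv-delete -f ${VENV}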
tfidf.py — last active February 12, 2018
from sklearn.pipeline import Pipeline, FeatureUnion

pipeline = Pipeline([
    # Use FeatureUnion to combine features from the review text and the rating
    ('union', FeatureUnion(
        transformer_list=[
            # Bag-of-words counts over the review text
            ('review_text', Pipeline([
                ('selector', ItemSelector(key='review_text')),
                ('count_dict', CountVectorizer()),
            ])),
            # Numeric rating column (completed from context: the gist cuts off
            # here; a reshape to 2-D would be needed before stacking)
            ('rating', Pipeline([
                ('selector', ItemSelector(key='rating')),
            ])),
        ],
    )),
])
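ItemSelector is not part of scikit-learn; the pipeline above assumes a small custom transformer along the lines of the one in scikit-learn's feature-union example. A minimal sketch:

from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Select a single column from a dict-like or pandas DataFrame input."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]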
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# count_train / count_test are the document-term matrices produced by the
# CountVectorizer defined further down, fitted on X_train / X_test
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)
pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print('Classifier accuracy: ' + str(round(100 * score, 2)) + '%')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out a third of the reviews for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    data['review_text'], data['sentiment'], test_size=0.33, random_state=1)

# Initialize CountVectorizer and TfidfVectorizer objects
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
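The gist never shows where count_train / count_test (consumed by the classifier above) come from; a minimal sketch, assuming they are the fitted CountVectorizer matrices:

# Assumed glue: produce the matrices the MultinomialNB snippet consumes
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# The TF-IDF variant, if swapping vectorizers
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)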
def get_sent(rating):
    # Map the star rating onto a sentiment label
    if rating < 3:
        sent = 'negative'
    elif rating > 3:
        sent = 'positive'
    else:
        sent = 'mweh'  # the neutral 3-star middle ground
    return sent

data['sentiment'] = data.apply(lambda row: get_sent(row['rating']), axis=1)
# Looking for the Flesch-Kincaid readability measure across our population
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen

dic = pyphen.Pyphen(lang='en')

# Space out punctuation glued to words, strip HTML tags, then count tokens
data['word_count'] = data.apply(lambda row: re.sub(r"(\w)([.,;:!\-?'\"”\)])", r"\1 \2", row['review_text']), axis=1)
data['word_count'] = data.apply(lambda row: re.sub(r"([.,;:!\-?'\"“\(])(\w)", r"\1 \2", row['word_count']), axis=1)
data['word_count'] = data.apply(lambda row: re.sub(r"<[^>]*>", "", row['word_count']), axis=1)
data['word_count'] = data.apply(lambda row: len(word_tokenize(row['word_count'])), axis=1)
data['sent_count'] = data.apply(lambda row: len(sent_tokenize(row['review_text'])), axis=1)
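The snippet sets up dic but the syllable step itself is not captured. A hedged sketch of the Flesch-Kincaid grade computation, counting syllables via pyphen's hyphenation points (the syll_count / fk_grade column names are assumptions):

def count_syllables(text):
    # A word has (number of hyphenation points + 1) syllables under pyphen
    words = [w for w in word_tokenize(text) if w.isalpha()]
    return sum(len(dic.inserted(w).split('-')) for w in words)

# Assumed column names; the formula is the standard Flesch-Kincaid grade level
data['syll_count'] = data['review_text'].apply(count_syllables)
data['fk_grade'] = (0.39 * data['word_count'] / data['sent_count']
                    + 11.8 * data['syll_count'] / data['word_count']
                    - 15.59)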
def parse_label(label):
    # '__label2__' marks a genuine review; everything else is fake
    if label == '__label2__':
        return 'real'
    else:
        return 'fake'

def parse_verification(label):
    # Map the verified-purchase flag onto 0/1
    # (completed from context: the gist cuts off here)
    if label == 'N':
        return 0
    else:
        return 1
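Assumed usage of the two parsers above; the source column names are hypothetical:

# Hypothetical column names; adjust to the actual dataset schema
data['label'] = data['label'].apply(parse_label)
data['verified'] = data['verified_purchase'].apply(parse_verification)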
featureDict = {}  # A global dictionary of features

def toFeatureVector(tokens, rating=None, verified=None):
    # Returns a dictionary containing features as keys, and weights as values.
    # rating/verified are optional so one function covers both call signatures
    # that appear in the gist's revisions.
    v = {}
    for t in tokens:
        # Global document frequency of each token
        try:
            featureDict[t] += 1
        except KeyError:
            featureDict[t] = 1
        # Per-review token count as the feature weight
        try:
            v[t] += 1
        except KeyError:
            v[t] = 1
    # Assumption: fold the extra signals in as pseudo-features when supplied
    if rating is not None:
        v['__rating__'] = rating
    if verified is not None:
        v['__verified__'] = verified
    return v
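preProcess, called by crossValidate below, is not captured in the gist; a minimal sketch, assuming simple lowercasing plus NLTK tokenization:

def preProcess(text):
    # Assumed preprocessing: lowercase and tokenize
    return word_tokenize(text.lower())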
from random import shuffle

def crossValidate(dataset, folds):
    shuffle(dataset)
    predictions = []
    ground_truth = []
    foldSize = int(len(dataset) / folds)
    # Preprocess and tokenize once!
    dataset = [(t[0], toFeatureVector(preProcess(t[1])), t[2]) for t in dataset]
    for i in range(0, len(dataset), foldSize):
        # Hold out one fold for testing, train on the rest
        trainFolds = dataset[:i] + dataset[i + foldSize:]
        testFold = dataset[i:i + foldSize]
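The gist cuts off mid-loop. A hedged sketch of how each fold might be scored, reusing MultinomialNB and metrics from above plus DictVectorizer (add `from sklearn.feature_extraction import DictVectorizer` to the imports); the gist's own training helpers are not captured, so this is one plausible completion assuming the third tuple element is the label, not the author's code:

        # Hedged completion: vectorize the feature dicts and score the held-out fold
        vec = DictVectorizer()
        X_tr = vec.fit_transform([feats for _, feats, _ in trainFolds])
        y_tr = [label for _, _, label in trainFolds]
        X_te = vec.transform([feats for _, feats, _ in testFold])
        clf = MultinomialNB().fit(X_tr, y_tr)
        predictions.extend(clf.predict(X_te))
        ground_truth.extend(label for _, _, label in testFold)
    return metrics.accuracy_score(ground_truth, predictions)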