Makefile — Python environment with pyenv-virtualenv (forked from genyrosk/Makefile, created July 18, 2023)
## Make sure you have `pyenv` and `pyenv-virtualenv` installed beforehand
SHELL:=/bin/bash
PROJECT=project
VERSION=3.7.4
VENV=${PROJECT}-${VERSION}
VENV_DIR=$(shell pyenv root)/versions/${VENV}
PYTHON=${VENV_DIR}/bin/python
JUPYTER_ENV_NAME=${VENV}
JUPYTER_PORT=8888
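The gist as captured stops at the variable block. A minimal sketch of targets these variables could drive (target names are assumptions; the pyenv-virtualenv and ipykernel commands themselves are standard):

# Sketch only: target names assumed, commands are standard pyenv/ipykernel usage
venv:
	pyenv virtualenv ${VERSION} ${VENV}
	pyenv local ${VENV}

jupyter:
	${PYTHON} -m pip install jupyter ipykernel
	${PYTHON} -m ipykernel install --user --name ${JUPYTER_ENV_NAME}
	jupyter notebook --port ${JUPYTER_PORT}

clean:
	pyenv virtualenv-delete -f ${VENV}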
tfidf.py — last active February 12, 2018
from sklearn.pipeline import Pipeline, FeatureUnion

pipeline = Pipeline([
    # Use FeatureUnion to combine features from the review text and the rating
    ('union', FeatureUnion(
        transformer_list=[
            # Bag-of-words counts over the review text
            ('review_text', Pipeline([
                ('selector', ItemSelector(key='review_text')),
                ('count_dict', CountVectorizer()),
            ])),
            # Numeric rating column (completed from context: the gist cuts off
            # here; a reshape to 2-D would be needed before stacking)
            ('rating', Pipeline([
                ('selector', ItemSelector(key='rating')),
            ])),
        ],
    )),
])
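ItemSelector is not part of scikit-learn; the pipeline above assumes a small custom transformer along the lines of the one in scikit-learn's feature-union example. A minimal sketch:

from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Select a single column from a dict-like or pandas DataFrame input."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]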
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# count_train / count_test are the document-term matrices produced by the
# CountVectorizer defined further down, fitted on X_train / X_test
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)
pred = nb_classifier.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print('Classifier accuracy: ' + str(round(100 * score, 2)) + '%')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out a third of the reviews for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    data['review_text'], data['sentiment'], test_size=0.33, random_state=1)

# Initialize CountVectorizer and TfidfVectorizer objects
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
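The gist never shows where count_train / count_test (consumed by the classifier above) come from; a minimal sketch, assuming they are the fitted CountVectorizer matrices:

# Assumed glue: produce the matrices the MultinomialNB snippet consumes
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# The TF-IDF variant, if swapping vectorizers
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)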
def get_sent(rating):
    # Map the star rating onto a sentiment label
    if rating < 3:
        sent = 'negative'
    elif rating > 3:
        sent = 'positive'
    else:
        sent = 'mweh'  # the neutral 3-star middle ground
    return sent

data['sentiment'] = data.apply(lambda row: get_sent(row['rating']), axis=1)
# Looking for the Flesch-Kincaid readability measure across our population
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import pyphen

dic = pyphen.Pyphen(lang='en')

# Space out punctuation glued to words, strip HTML tags, then count tokens
data['word_count'] = data.apply(lambda row: re.sub(r"(\w)([.,;:!\-?'\"”\)])", r"\1 \2", row['review_text']), axis=1)
data['word_count'] = data.apply(lambda row: re.sub(r"([.,;:!\-?'\"“\(])(\w)", r"\1 \2", row['word_count']), axis=1)
data['word_count'] = data.apply(lambda row: re.sub(r"<[^>]*>", "", row['word_count']), axis=1)
data['word_count'] = data.apply(lambda row: len(word_tokenize(row['word_count'])), axis=1)
data['sent_count'] = data.apply(lambda row: len(sent_tokenize(row['review_text'])), axis=1)
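The snippet sets up dic but the syllable step itself is not captured. A hedged sketch of the Flesch-Kincaid grade computation, counting syllables via pyphen's hyphenation points (the syll_count / fk_grade column names are assumptions):

def count_syllables(text):
    # A word has (number of hyphenation points + 1) syllables under pyphen
    words = [w for w in word_tokenize(text) if w.isalpha()]
    return sum(len(dic.inserted(w).split('-')) for w in words)

# Assumed column names; the formula is the standard Flesch-Kincaid grade level
data['syll_count'] = data['review_text'].apply(count_syllables)
data['fk_grade'] = (0.39 * data['word_count'] / data['sent_count']
                    + 11.8 * data['syll_count'] / data['word_count']
                    - 15.59)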
def parse_label(label):
    # '__label2__' marks a genuine review; everything else is fake
    if label == '__label2__':
        return 'real'
    else:
        return 'fake'

def parse_verification(label):
    # Map the verified-purchase flag onto 0/1
    # (completed from context: the gist cuts off here)
    if label == 'N':
        return 0
    else:
        return 1
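Assumed usage of the two parsers above; the source column names are hypothetical:

# Hypothetical column names; adjust to the actual dataset schema
data['label'] = data['label'].apply(parse_label)
data['verified'] = data['verified_purchase'].apply(parse_verification)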
featureDict = {}  # A global dictionary of features

def toFeatureVector(tokens, rating=None, verified=None):
    # Returns a dictionary containing features as keys, and weights as values.
    # rating/verified are optional so one function covers both call signatures
    # that appear in the gist's revisions.
    v = {}
    for t in tokens:
        # Global document frequency of each token
        try:
            featureDict[t] += 1
        except KeyError:
            featureDict[t] = 1
        # Per-review token count as the feature weight
        try:
            v[t] += 1
        except KeyError:
            v[t] = 1
    # Assumption: fold the extra signals in as pseudo-features when supplied
    if rating is not None:
        v['__rating__'] = rating
    if verified is not None:
        v['__verified__'] = verified
    return v
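preProcess, called by crossValidate below, is not captured in the gist; a minimal sketch, assuming simple lowercasing plus NLTK tokenization:

def preProcess(text):
    # Assumed preprocessing: lowercase and tokenize
    return word_tokenize(text.lower())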
from random import shuffle

def crossValidate(dataset, folds):
    shuffle(dataset)
    predictions = []
    ground_truth = []
    foldSize = int(len(dataset) / folds)
    # Preprocess and tokenize once!
    dataset = [(t[0], toFeatureVector(preProcess(t[1])), t[2]) for t in dataset]
    for i in range(0, len(dataset), foldSize):
        # Hold out one fold for testing, train on the rest
        trainFolds = dataset[:i] + dataset[i + foldSize:]
        testFold = dataset[i:i + foldSize]
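The gist cuts off mid-loop. A hedged sketch of how each fold might be scored, reusing MultinomialNB and metrics from above plus DictVectorizer (add `from sklearn.feature_extraction import DictVectorizer` to the imports); the gist's own training helpers are not captured, so this is one plausible completion assuming the third tuple element is the label, not the author's code:

        # Hedged completion: vectorize the feature dicts and score the held-out fold
        vec = DictVectorizer()
        X_tr = vec.fit_transform([feats for _, feats, _ in trainFolds])
        y_tr = [label for _, _, label in trainFolds]
        X_te = vec.transform([feats for _, feats, _ in testFold])
        clf = MultinomialNB().fit(X_tr, y_tr)
        predictions.extend(clf.predict(X_te))
        ground_truth.extend(label for _, _, label in testFold)
    return metrics.accuracy_score(ground_truth, predictions)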