This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pre-compiled patterns used by clean_text(). Raw strings (r'...') keep the
# compiled patterns identical while avoiding Python's invalid-escape-sequence
# warning for \[ \] \| inside a plain string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation to replace with a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # anything outside this whitelist is removed
STOPWORDS = set(stopwords.words('english'))  # NLTK English stopword list (set for O(1) lookup)
def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # NOTE(review): the body of this function is missing from this excerpt
    # (truncated by the snippet export). Presumably it lowercases `text` and
    # applies REPLACE_BY_SPACE_RE / BAD_SYMBOLS_RE / STOPWORDS defined above —
    # confirm against the original notebook before relying on it.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# CountVectorizer is used below but was not imported in this cell; import it
# alongside TfidfTransformer so the snippet is self-contained.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Naive Bayes baseline: bag-of-words counts -> TF-IDF weighting -> MultinomialNB.
nb = Pipeline([('vect', CountVectorizer()),       # raw token counts
               ('tfidf', TfidfTransformer()),     # inverse-document-frequency re-weighting
               ('clf', MultinomialNB()),          # multinomial Naive Bayes classifier
               ])
# X_train / y_train come from a train/test split in an earlier cell.
nb.fit(X_train, y_train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import SGDClassifier

# Linear SVM trained by stochastic gradient descent (hinge loss, L2 penalty),
# over the same counts -> TF-IDF features as the other baselines.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
# X_train / y_train come from a train/test split in an earlier cell.
sgd.fit(X_train, y_train)
%%time |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline over counts -> TF-IDF features.
# C=1e5 means very weak regularization.
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
# X_train / y_train come from a train/test split in an earlier cell.
logreg.fit(X_train, y_train)
%%time |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def word_averaging(wv, words):
    """Collect word vectors for `words` from keyed-vectors `wv`.

    NOTE(review): this excerpt is truncated — the aggregation and return of
    `mean` are not visible here; confirm against the original notebook.
    Uses the pre-gensim-4 KeyedVectors API (wv.vocab / wv.syn0norm).
    """
    all_words, mean = set(), []
    for word in words:
        if isinstance(word, np.ndarray):
            # Already a vector: use it as-is.
            mean.append(word)
        elif word in wv.vocab:
            # In-vocabulary token: take its L2-normalised vector and record
            # its vocabulary index.
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def w2v_tokenize_text(text):
    """Split `text` into word tokens, dropping tokens shorter than 2 chars."""
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]
# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
train, test = train_test_split(df, test_size=0.3, random_state=42)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tqdm import tqdm | |
tqdm.pandas(desc="progress-bar") | |
from gensim.models import Doc2Vec | |
from sklearn import utils | |
import gensim | |
from gensim.models.doc2vec import TaggedDocument | |
import re | |
def label_sentences(corpus, label_type): | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Distributed bag-of-words Doc2Vec: 300-d vectors, 5 negative samples,
# min_count=1 keeps every token in the vocabulary.
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1,
                     alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab(list(tqdm(all_data)))

# 30 manual passes over the shuffled corpus, decaying the learning rate by
# 0.002 per pass (min_alpha pinned to alpha so each pass uses a flat rate).
for epoch in range(30):
    shuffled = utils.shuffle(list(tqdm(all_data)))
    model_dbow.train(shuffled, total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    # Pre-allocate one row per document. NOTE(review): the loop that fills
    # `vectors` from the model (selected by vectors_type) and the return
    # statement are truncated in this excerpt — confirm against the original
    # notebook.
    vectors = np.zeros((corpus_size, vectors_size))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import os | |
%matplotlib inline | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import tensorflow as tf | |
from sklearn.preprocessing import LabelBinarizer, LabelEncoder |