rafaljanwojcik

## clustering.py
# Load the saved Word2Vec model and keep only its keyed word vectors.
word_vectors = Word2Vec.load("../preprocessing_and_embeddings/word2vec.model").wv

# Partition the embedding space into two clusters.
# NOTE(review): random_state=True is presumably meant as a fixed integer seed -- confirm.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=True,
    n_init=50,
)
model.fit(X=word_vectors.vectors)

# Centre 0 is taken as the positive cluster and centre 1 as the negative one.
# NOTE(review): KMeans numbers its clusters arbitrarily -- verify the assignment
# by inspecting the words closest to each centre.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

## assigning_clusters.py
# One row per vocabulary word.
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']

# `word_vectors` already holds the model's keyed vectors, so it is indexed directly.
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])

# predict() expects a 2-D batch: wrap each vector in a list and unwrap the single label.
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)])[0])

# Sign of the word: +1 for cluster 0, -1 for any other cluster.
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]

# transform() yields the distance to every cluster centre; the inverse of the
# smallest distance is used as the closeness score.
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)

# Signed closeness: the per-word sentiment coefficient.
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

## tfidf_vectorizer.py
# Whitespace tokenizer; norm=None keeps the raw (un-normalised) tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.title)

# get_feature_names() no longer exists in scikit-learn >= 1.2, so prefer its
# replacement when the installed version provides it and fall back otherwise.
features = pd.Series(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Sparse tf-idf matrix for the same titles the vectorizer was fitted on.
transformed = tfidf.transform(file_weighting.title)

## predictions.py
# Stack the four inputs as rows, then transpose so that each one becomes a column.
replacement_df = pd.DataFrame(
    data=[
        replaced_closeness_scores,
        replaced_tfidf_scores,
        file_weighting.title,
        file_weighting.rate,
    ]
).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']

# Row score: dot product of the row's sentiment coefficients with its tf-idf scores.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row.loc['sentiment_coeff']) @ np.array(row.loc['tfidf_scores']),
    axis=1,
)

# A strictly positive score is predicted as 1, everything else as 0.
replacement_df['prediction'] = replacement_df.sentiment_rate.gt(0).astype('int8')

## sentiment_replacement.py
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score stored for a single word

    word - key to look up,
    sentiment_dict - mapping from words to their sentiment scores

    returns the stored score, or 0 when the lookup raises KeyError
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        # words without a known sentiment contribute nothing
        return 0


## tfidf_weighting.py
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

## cleaning.py
from unidecode import unidecode

def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words

    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    text - raw input; it is coerced with str() after remove_polish_letters has been applied
    remove_polish_letters - callable applied to text before any other processing

    NOTE(review): only the first normalisation steps are visible in this excerpt; the
    cleaning and splitting that would produce the word list are not shown here.
    '''
    # Run the caller-supplied callable first, then force a lower-cased str.
    text = remove_polish_letters(text)
    text = str(text)
    text = text.lower()

    # Clean the text

## file
def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words

    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

    text - raw input; it is coerced with str() after remove_polish_letters has been applied
    remove_polish_letters - callable applied to text before any other processing

    NOTE(review): the visible part of the function stops after two substitutions; the
    splitting into words and the return value are not shown in this excerpt.
    '''
    # Run the caller-supplied callable first, then force a lower-cased str.
    text = remove_polish_letters(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    # NOTE(review): `sub` is presumably re.sub imported elsewhere in the file -- confirm.
    # Replace every character other than letters, digits and ^ , ! ? . / ' + with a space.
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    # Spell the plus sign out as a separate word.
    text = sub(r"\+", " plus ", text)
	# Load the saved Word2Vec model and keep only its keyed word vectors.
	word_vectors = Word2Vec.load("../preprocessing_and_embeddings/word2vec.model").wv

	# Partition the embedding space into two clusters.
	# NOTE(review): random_state=True is presumably meant as a fixed integer seed -- confirm.
	model = KMeans(
		n_clusters=2,
		max_iter=1000,
		random_state=True,
		n_init=50,
	)
	model.fit(X=word_vectors.vectors)

	# Centre 0 is taken as the positive cluster and centre 1 as the negative one.
	# NOTE(review): KMeans numbers its clusters arbitrarily -- verify the assignment
	# by inspecting the words closest to each centre.
	positive_cluster_center, negative_cluster_center = (
		model.cluster_centers_[0],
		model.cluster_centers_[1],
	)
	# One row per vocabulary word.
	words = pd.DataFrame(word_vectors.vocab.keys())
	words.columns = ['words']

	# `word_vectors` already holds the model's keyed vectors, so it is indexed directly.
	words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])

	# predict() expects a 2-D batch: wrap each vector in a list and unwrap the single label.
	words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)])[0])

	# Sign of the word: +1 for cluster 0, -1 for any other cluster.
	words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]

	# transform() yields the distance to every cluster centre; the inverse of the
	# smallest distance is used as the closeness score.
	words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)

	# Signed closeness: the per-word sentiment coefficient.
	words['sentiment_coeff'] = words.closeness_score * words.cluster_value
	# Whitespace tokenizer; norm=None keeps the raw (un-normalised) tf-idf weights.
	tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
	tfidf.fit(file_weighting.title)

	# get_feature_names() no longer exists in scikit-learn >= 1.2, so prefer its
	# replacement when the installed version provides it and fall back otherwise.
	features = pd.Series(
		tfidf.get_feature_names_out()
		if hasattr(tfidf, 'get_feature_names_out')
		else tfidf.get_feature_names()
	)

	# Sparse tf-idf matrix for the same titles the vectorizer was fitted on.
	transformed = tfidf.transform(file_weighting.title)
	# Stack the four inputs as rows, then transpose so that each one becomes a column.
	replacement_df = pd.DataFrame(
		data=[
			replaced_closeness_scores,
			replaced_tfidf_scores,
			file_weighting.title,
			file_weighting.rate,
		]
	).T
	replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']

	# Row score: dot product of the row's sentiment coefficients with its tf-idf scores.
	replacement_df['sentiment_rate'] = replacement_df.apply(
		lambda row: np.array(row.loc['sentiment_coeff']) @ np.array(row.loc['tfidf_scores']),
		axis=1,
	)

	# A strictly positive score is predicted as 1, everything else as 0.
	replacement_df['prediction'] = replacement_df.sentiment_rate.gt(0).astype('int8')
	def replace_sentiment_words(word, sentiment_dict):
		'''
		replacing each word with its associated sentiment score from sentiment dict

		word - key to look up,
		sentiment_dict - mapping from words to their sentiment scores

		returns the stored score, or 0 when the lookup raises KeyError
		'''
		try:
			return sentiment_dict[word]
		except KeyError:
			# words without a known sentiment contribute nothing
			return 0
	def create_tfidf_dictionary(x, transformed_file, features):
	'''
	create dictionary for each input sentence x, where each word has assigned its tfidf score

	inspired by function from this wonderful article:
	https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

	x - row of dataframe, containing sentences, and their indexes,
	transformed_file - all sentences transformed with TfidfVectorizer
	features - names of all words in corpus used in TfidfVectorizer
	from unidecode import unidecode

	def text_to_word_list(text, remove_polish_letters):
	''' Pre process and convert texts to a list of words
	method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
	text = remove_polish_letters(text)
	text = str(text)
	text = text.lower()

	# Clean the text