This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def calculate_matrix_cosine_similarity(matrix1, matrix2):
    """Return the pairwise cosine similarity between the rows of two matrices.

    Entry ``(i, j)`` of the result is the cosine similarity between
    ``matrix1[i]`` and ``matrix2[j]``.

    Args:
        matrix1: Array-like of shape ``(n, d)``.
        matrix2: Array-like of shape ``(m, d)``.

    Returns:
        A ``numpy.ndarray`` of shape ``(n, m)``. A row with zero norm makes
        the corresponding denominators zero, so those entries are not finite.
    """
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)
    row_norms1 = np.linalg.norm(matrix1, axis=1)
    row_norms2 = np.linalg.norm(matrix2, axis=1)
    # Entry (i, j) has to be scaled by |matrix1[i]| * |matrix2[j]|. The outer
    # product builds that full (n, m) grid; an elementwise product of the two
    # norm vectors would pair row i only with row i and would not even
    # broadcast when n != m.
    return np.dot(matrix1, matrix2.T) / np.outer(row_norms1, row_norms2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
def flat_list_of_lists(inputs):
    """Flatten one level of nesting in *inputs*.

    Args:
        inputs: An iterable whose items are themselves iterables.

    Returns:
        An ``itertools.chain`` iterator that yields the items of each inner
        iterable in order. The result is lazy; wrap it in ``list(...)`` when
        a concrete list is needed.
    """
    flatten = itertools.chain.from_iterable
    return flatten(inputs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def calculate_mean_vector(embedding):
    """Return the element-wise mean of all vectors stored in *embedding*.

    Args:
        embedding: A word-vector container that can be indexed with a list
            of keys and that exposes its keys through ``key_to_index``
            (gensim >= 4) or ``vocab`` (gensim < 4).

    Returns:
        The mean over axis 0 of the vectors looked up for every key.
    """
    # gensim 4 removed ``KeyedVectors.vocab`` in favour of ``key_to_index``;
    # prefer the new attribute and fall back to the legacy one so both
    # library generations keep working.
    keys = getattr(embedding, "key_to_index", None)
    if keys is None:
        keys = embedding.vocab
    return np.mean(embedding[list(keys)], axis=0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def calculate_vector_cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector, with the same length as ``vector1``.

    Returns:
        The dot product of the two vectors divided by the product of their
        norms as computed by ``np.linalg.norm``.
    """
    inner_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return inner_product / norm_product
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import linecache | |
from gensim.models import KeyedVectors | |
def load_embedding_with_gensim(embedding_name): | |
''' | |
Load embeddings with gensim. | |
''' | |
if embedding_name.endswith('bin'): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
# Command-line interface: ``--inputs`` takes one comma-separated string and
# is stored on ``args.inputs`` as a list of its pieces.
parser = argparse.ArgumentParser()


def args_list(x):
    """Split the comma-separated string *x* into a list of strings."""
    return [str(piece) for piece in x.split(',')]


parser.add_argument('--inputs', type=args_list)
args = parser.parse_args()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
def parse_args(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--input', type=str, required=True) | |
args = parser.parse_args() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim | |
import argparse | |
import numpy as np | |
def parse_args(): | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--input', type=str, required=True) | |
parser.add_argument('--output', type=str, required=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import regex as re | |
def split_sentence_to_words(sent):
    """Split *sent* into tokens using a fixed regular expression.

    The alternatives of the pattern are, in order: the suffixes ``'s``,
    ``'t``, ``'re``, ``'ve``, ``'m``, ``'ll`` and ``'d``; a run of letters;
    a run of digits; a run of characters that are neither whitespace,
    letters nor digits (each of these three optionally preceded by a single
    space); and runs of whitespace.

    Args:
        sent: The text to split.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    # ``\p{L}`` / ``\p{N}`` are Unicode property classes, so ``re`` here must
    # be the ``regex`` package that this file imports under that name.
    token_pattern = re.compile(
        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
    )
    return token_pattern.findall(sent)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import linecache | |
from gensim.models import KeyedVectors | |
def save_word_embedding_text_to_binary(input, output): | |
if linecache.getline(input, 1).split() == 2: | |
no_header = False | |
else: |
NewerOlder