Skip to content

Instantly share code, notes, and snippets.

@Lagyamfi
Lagyamfi / train_embeddings_5.py
Created June 10, 2020 22:58
train_embeddings_5.py
# Driver snippet: load the Twi corpus, train embeddings, and probe them.
# Relies on read_dataset / get_embedding / get_similar defined in the
# sibling gist files, and on TWI_PATH / NUMBER_OF_DATASET / FastText
# from the setup snippet.

# Read twi data from supplied path to TWI file and preprocess
# (normalize=True applies the Twi-specific normalization path)
twi_data = read_dataset(TWI_PATH, NUMBER_OF_DATASET, normalize=True, language="twi")
# create embeddings from preprocessed twi data
# sg=1 selects skip-gram training; size=100 is the vector dimensionality
embeddings = get_embedding(twi_data, FastText, size = 100, sg=1)
# test to see some similar words to some word using the learned embeddings
# NOTE(review): input_word is left empty here — fill in a Twi word before running
input_word = ""
get_similar(input_word, embeddings)
@Lagyamfi
Lagyamfi / train_embeddings_4.py
Created June 10, 2020 22:55
train_embeddings_4.py
def prepare_for_visualization(model, model_path=None, save_dir="."):
    """
    Generates tsv formats of metadata and tensors/vectors for embeddings.
    Useful for tensorflow embeddings projector.

    Parameters
    ----------
    model : a gensim model with a ``wv`` keyed-vectors attribute; ignored
        (overwritten) when ``model_path`` is given.
    model_path : optional path to a saved word2vec-format file; when set,
        the model is loaded from disk instead of using ``model``.
    save_dir : directory where ``embedding_tensors.tsv`` and
        ``embedding_metadata.dat`` are written (default: current dir).
    """
    if model_path: # to do -> check correctness of path
        # NOTE(review): encoding="utf-16" is unusual for word2vec text
        # format — confirm the file was actually saved as UTF-16.
        model = gensim.models.KeyedVectors.load_word2vec_format(f"{model_path}", binary=False, encoding="utf-16")
    with open(f"{save_dir}/embedding_tensors.tsv", 'w+') as tensors:
        with open(f"{save_dir}/embedding_metadata.dat", 'w+') as metadata:
            # One row per vocabulary word; loop body is truncated in this
            # view of the source (gist preview cut off) — presumably it
            # writes the word to metadata and its vector to tensors.
            for word in model.wv.index2word:
@Lagyamfi
Lagyamfi / train_embeddings_3.py
Created June 10, 2020 22:53
train_embeddings_3.py
def get_embedding(data, typeFunc=Word2Vec, size=100, window=5, min_count=5, sg=0, save=False):
    """
    Train word embeddings on tokenized sentences with a gensim model class.

    Currently works with either Word2Vec or FastText from gensim.

    Parameters
    ----------
    data : iterable of token lists (one list per sentence).
    typeFunc : gensim model class to instantiate (Word2Vec or FastText).
    size : dimensionality of the learned vectors.
    window : context window size.
    min_count : minimum corpus frequency for a word to be kept.
    sg : 0 for CBOW, 1 for skip-gram.
    save : when True, persist the model to ./<ClassName>_embedding.mod.

    Returns
    -------
    The trained gensim model instance.
    """
    model = typeFunc(
        data,
        size=size,
        window=window,
        min_count=min_count,
        workers=4,  # parallel training threads
        sg=sg,
    )
    if save:
        # File name is derived from the model class, e.g. FastText_embedding.mod
        model.save(f"./{typeFunc.__name__}_embedding.mod")
    return model
@Lagyamfi
Lagyamfi / train_embeddings.py
Created June 10, 2020 22:50
train_embeddings.py
def read_dataset(file_path, number=None, normalize=False, language="eng"):
    """
    Read NUMBER_OF_DATASET lines of data in supplied file_path
    Perform normalization (if normalize=True) based on input language(default:"eng", option:"twi")
    Returns
    -------
    List[list] of processed word tokens for sentences in file_path

    Parameters
    ----------
    file_path : path to the text corpus, one sentence per line.
    number : max number of lines to read; None presumably means read all
        (body is truncated in this view — confirm against the full gist).
    normalize : apply language-specific text normalization when True.
    language : "eng" (default) or "twi"; selects the normalization rules.
    """
    # NOTE(review): function body is cut off by the gist preview below
    # this line; only the file-open is visible here.
    with open(file_path) as file:
@Lagyamfi
Lagyamfi / train_embeddings.py
Last active June 10, 2020 22:47
train_embeddings.py
import numpy as np
import unicodedata
import re
import os
from gensim.models import Word2Vec, FastText
NUMBER_OF_DATASET = 10000 # Number of lines in the twi text to read
TWI_PATH = "../jw300.en-tw.tw" # to be filled with path to twi text file
{
"embeddings": [
{
"tensorName": "Twi Embeddings",
"tensorShape": [
3890,
100
],
"tensorPath": "https://gist.githubusercontent.com/Lagyamfi/83ed29121d9c6a3c55cc43691ab1e6f1/raw/6c80f628fa2b50e883d9163acdb6acbd3f8576af/embedding_tensors.tsv",
"metadataPath": "https://gist.githubusercontent.com/Lagyamfi/759b66dc72ed60977d7d68047644b1a9/raw/b09723bc18fe03ade7f2a9098677d111211737df/embedding_metadata.dat"
a
no
.
na
mu
ne
ho
so
We can't make this file beautiful and searchable because it's too large.
0.00052419817 -0.013428101 -0.18923548 -0.12378584 -0.13348275 -0.19504371 -0.21705475 -0.06131517 -0.21444488 0.06652182 0.02302025 -0.062904164 -0.23141232 -0.042251814 -0.1532043 -0.062490422 0.0032143332 0.16235732 -0.03971875 -0.08317669 -0.039773207 0.1185327 -0.048253365 -0.041885294 -0.062435914 0.030222977 -0.069686696 0.08345175 -0.14653802 0.22368638 -0.17269213 -0.053428203 0.045868214 -0.1745172 0.14802118 -0.16412453 0.0831089 -0.08549252 -0.0844746 0.10793445 -0.02170284 0.25657293 0.010812476 0.018980177 -0.062152766 -0.07008561 0.13627873 0.0975509 -0.12622985 -0.184571 -0.28338507 0.009989383 0.017243551 -0.046570335 0.016103085 -0.03306862 0.066873915 -0.164175 -0.15156694 -0.112727 -0.17696792 -0.14249855 -0.043522142 -0.0874118 0.12529133 -0.19809446 -0.11694473 -0.110769145 0.22212538 -0.034395188 -0.039013393 -0.2554705 -0.18736535 -0.058384836 -0.16850983 0.04507998 0.11389926 -0.10845794 0.029420663 0.02597659 0.0058230497 0.06305347 0.07172061 -0.10133277 0.03858092 0.19234028 0.0204
We couldn’t find that file to show.