Lagyamfi

## train_embeddings_5.py
# Load the Twi corpus found at TWI_PATH, reading NUMBER_OF_DATASET lines and
# applying the normalization that read_dataset performs for language="twi".
twi_data = read_dataset(
    file_path=TWI_PATH,
    number=NUMBER_OF_DATASET,
    normalize=True,
    language="twi",
)

# Train 100-dimensional FastText embeddings on the preprocessed sentences.
# sg=1 is forwarded by get_embedding to the gensim model constructor
# (gensim documents sg=1 as skip-gram).
embeddings = get_embedding(twi_data, typeFunc=FastText, size=100, sg=1)

# Sanity check: look up words similar to `input_word` in the trained model.
# NOTE: input_word is an empty placeholder here; set it to a real word first.
input_word = ""
get_similar(input_word, embeddings)

## train_embeddings_4.py
def prepare_for_visualization(model, model_path=None, save_dir="."):
    """
    Generates tsv formats of metadata and tensors/vectors for embeddings.
    Useful for tensorflow embeddings projector.
    """
    if model_path:   # to do -> check correctness of path
        model = gensim.models.KeyedVectors.load_word2vec_format(f"{model_path}", binary=False, encoding="utf-16")
    with open(f"{save_dir}/embedding_tensors.tsv", 'w+') as tensors:
        with open(f"{save_dir}/embedding_metadata.dat", 'w+') as metadata:
            for word in model.wv.index2word:

## train_embeddings_3.py
def get_embedding(data, typeFunc=Word2Vec, size=100, window=5, min_count=5, sg=0,
                  save=False, workers=4, save_path=None):
    """
    Train an embedding model on ``data`` and optionally save it to disk.

    Currently works with either Word2Vec or FastText from gensim.

    Parameters
    ----------
    data
        Training corpus, passed as the first positional argument to
        ``typeFunc``. Presumably the list of token lists produced by
        ``read_dataset`` -- confirm against the caller.
    typeFunc : callable, default Word2Vec
        Model class to instantiate. Its ``__name__`` is used to build the
        default save filename.
    size, window, min_count, sg
        Forwarded unchanged to ``typeFunc`` as keyword arguments.
        NOTE(review): ``size=`` is the gensim 3.x keyword; gensim 4 renamed it
        to ``vector_size`` -- confirm the installed gensim version.
    save : bool, default False
        When true, the trained model is written to disk via ``.save()``.
    workers : int, default 4
        Forwarded to ``typeFunc``; previously hard-coded to 4.
    save_path : str or None, default None
        Destination used when ``save`` is true. ``None`` keeps the original
        location ``./<typeFunc.__name__>_embedding.mod``.

    Returns
    -------
    The model object returned by ``typeFunc``.
    """
    embeddings = typeFunc(
        data,
        size=size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg,
    )
    if save:
        if save_path is None:
            # Preserve the filename the function has always written to.
            save_path = f"./{typeFunc.__name__}_embedding.mod"
        embeddings.save(save_path)
    return embeddings

## train_embeddings.py
def read_dataset(file_path, number=None, normalize=False, language="eng"):
    """
    Read NUMBER_OF_DATASET lines of data in supplied file_path
    Perform normalization (if normalize=True) based on input language(default:"eng", option:"twi")
    Returns
    -------
    List[list] of processed word tokens for sentences in file_path
    """

    with open(file_path) as file:

## train_embeddings.py
import numpy as np
import unicodedata
import re
import os
from gensim.models import Word2Vec, FastText


# Number of lines of the Twi text to read; handed to read_dataset() as `number`.
NUMBER_OF_DATASET = 10000  # Number of lines in the twi text to read
# Path to the Twi text file; handed to read_dataset() as `file_path`.
TWI_PATH = "../jw300.en-tw.tw"  # to be filled with path to twi text file

## gist:55d2e6329b851cce8bc63d69f3d6a014
{
  "embeddings": [
    {
      "tensorName": "Twi Embeddings",
      "tensorShape": [
        3890,
        100
      ],
      "tensorPath": "https://gist.githubusercontent.com/Lagyamfi/83ed29121d9c6a3c55cc43691ab1e6f1/raw/6c80f628fa2b50e883d9163acdb6acbd3f8576af/embedding_tensors.tsv",
      "metadataPath": "https://gist.githubusercontent.com/Lagyamfi/759b66dc72ed60977d7d68047644b1a9/raw/b09723bc18fe03ade7f2a9098677d111211737df/embedding_metadata.dat"

## embedding_metadata.dat
a
no
.
sɛ
na
mu
ne
ho
wɔ
so

## embedding_tensors.tsv
0.00052419817	-0.013428101	-0.18923548	-0.12378584	-0.13348275	-0.19504371	-0.21705475	-0.06131517	-0.21444488	0.06652182	0.02302025	-0.062904164	-0.23141232	-0.042251814	-0.1532043	-0.062490422	0.0032143332	0.16235732	-0.03971875	-0.08317669	-0.039773207	0.1185327	-0.048253365	-0.041885294	-0.062435914	0.030222977	-0.069686696	0.08345175	-0.14653802	0.22368638	-0.17269213	-0.053428203	0.045868214	-0.1745172	0.14802118	-0.16412453	0.0831089	-0.08549252	-0.0844746	0.10793445	-0.02170284	0.25657293	0.010812476	0.018980177	-0.062152766	-0.07008561	0.13627873	0.0975509	-0.12622985	-0.184571	-0.28338507	0.009989383	0.017243551	-0.046570335	0.016103085	-0.03306862	0.066873915	-0.164175	-0.15156694	-0.112727	-0.17696792	-0.14249855	-0.043522142	-0.0874118	0.12529133	-0.19809446	-0.11694473	-0.110769145	0.22212538	-0.034395188	-0.039013393	-0.2554705	-0.18736535	-0.058384836	-0.16850983	0.04507998	0.11389926	-0.10845794	0.029420663	0.02597659	0.0058230497	0.06305347	0.07172061	-0.10133277	0.03858092	0.19234028	0.0204

## gist:550c647e168d93e16b25af00db093995

      
              0 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                Lagyamfi
                / gist:550c647e168d93e16b25af00db093995
            
            
              Created
              May 25, 2020 15:26
            
          
              We couldn’t find that file to show.
	# Load the Twi corpus found at TWI_PATH, reading NUMBER_OF_DATASET lines and
	# applying the normalization that read_dataset performs for language="twi".
	twi_data = read_dataset(
	    file_path=TWI_PATH,
	    number=NUMBER_OF_DATASET,
	    normalize=True,
	    language="twi",
	)

	# Train 100-dimensional FastText embeddings on the preprocessed sentences.
	# sg=1 is forwarded by get_embedding to the gensim model constructor
	# (gensim documents sg=1 as skip-gram).
	embeddings = get_embedding(twi_data, typeFunc=FastText, size=100, sg=1)

	# Sanity check: look up words similar to `input_word` in the trained model.
	# NOTE: input_word is an empty placeholder here; set it to a real word first.
	input_word = ""
	get_similar(input_word, embeddings)
	def prepare_for_visualization(model, model_path=None, save_dir="."):
	"""
	Generates tsv formats of metadata and tensors/vectors for embeddings.
	Useful for tensorflow embeddings projector.
	"""
	if model_path: # to do -> check correctness of path
	model = gensim.models.KeyedVectors.load_word2vec_format(f"{model_path}", binary=False, encoding="utf-16")
	with open(f"{save_dir}/embedding_tensors.tsv", 'w+') as tensors:
	with open(f"{save_dir}/embedding_metadata.dat", 'w+') as metadata:
	for word in model.wv.index2word:
	def get_embedding(data, typeFunc=Word2Vec, size=100, window=5, min_count=5, sg=0,
	                  save=False, workers=4, save_path=None):
	    """
	    Train an embedding model on ``data`` and optionally save it to disk.

	    Currently works with either Word2Vec or FastText from gensim.

	    Parameters
	    ----------
	    data
	        Training corpus, passed as the first positional argument to
	        ``typeFunc``. Presumably the list of token lists produced by
	        ``read_dataset`` -- confirm against the caller.
	    typeFunc : callable, default Word2Vec
	        Model class to instantiate. Its ``__name__`` is used to build the
	        default save filename.
	    size, window, min_count, sg
	        Forwarded unchanged to ``typeFunc`` as keyword arguments.
	        NOTE(review): ``size=`` is the gensim 3.x keyword; gensim 4 renamed
	        it to ``vector_size`` -- confirm the installed gensim version.
	    save : bool, default False
	        When true, the trained model is written to disk via ``.save()``.
	    workers : int, default 4
	        Forwarded to ``typeFunc``; previously hard-coded to 4.
	    save_path : str or None, default None
	        Destination used when ``save`` is true. ``None`` keeps the original
	        location ``./<typeFunc.__name__>_embedding.mod``.

	    Returns
	    -------
	    The model object returned by ``typeFunc``.
	    """
	    embeddings = typeFunc(
	        data,
	        size=size,
	        window=window,
	        min_count=min_count,
	        workers=workers,
	        sg=sg,
	    )
	    if save:
	        if save_path is None:
	            # Preserve the filename the function has always written to.
	            save_path = f"./{typeFunc.__name__}_embedding.mod"
	        embeddings.save(save_path)
	    return embeddings
	def read_dataset(file_path, number=None, normalize=False, language="eng"):
	"""
	Read NUMBER_OF_DATASET lines of data in supplied file_path
	Perform normalization (if normalize=True) based on input language(default:"eng", option:"twi")
	Returns
	-------
	List[list] of processed word tokens for sentences in file_path
	"""

	with open(file_path) as file:
	import numpy as np
	import unicodedata
	import re
	import os
	from gensim.models import Word2Vec, FastText


	# Number of lines of the Twi text to read; handed to read_dataset() as `number`.
	NUMBER_OF_DATASET = 10000 # Number of lines in the twi text to read
	# Path to the Twi text file; handed to read_dataset() as `file_path`.
	TWI_PATH = "../jw300.en-tw.tw" # to be filled with path to twi text file
	{
	"embeddings": [
	{
	"tensorName": "Twi Embeddings",
	"tensorShape": [
	3890,
	100
	],
	"tensorPath": "https://gist.githubusercontent.com/Lagyamfi/83ed29121d9c6a3c55cc43691ab1e6f1/raw/6c80f628fa2b50e883d9163acdb6acbd3f8576af/embedding_tensors.tsv",
	"metadataPath": "https://gist.githubusercontent.com/Lagyamfi/759b66dc72ed60977d7d68047644b1a9/raw/b09723bc18fe03ade7f2a9098677d111211737df/embedding_metadata.dat"