This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load and normalize the Twi corpus: read NUMBER_OF_DATASET lines from TWI_PATH.
twi_data = read_dataset(
    file_path=TWI_PATH,
    number=NUMBER_OF_DATASET,
    normalize=True,
    language="twi",
)

# Train 100-dimensional FastText embeddings (sg=1) on the preprocessed corpus.
embeddings = get_embedding(data=twi_data, typeFunc=FastText, size=100, sg=1)

# Sanity check: look up words similar to `input_word` in the learned embeddings.
# NOTE(review): `input_word` is an empty placeholder; set it to a real Twi word
# before running this step.
input_word = ""
get_similar(input_word, embeddings)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def prepare_for_visualization(model, model_path=None, save_dir="."): | |
| """ | |
| Generates tsv formats of metadata and tensors/vectors for embeddings. | |
| Useful for tensorflow embeddings projector. | |
| """ | |
| if model_path: # to do -> check correctness of path | |
| model = gensim.models.KeyedVectors.load_word2vec_format(f"{model_path}", binary=False, encoding="utf-16") | |
| with open(f"{save_dir}/embedding_tensors.tsv", 'w+') as tensors: | |
| with open(f"{save_dir}/embedding_metadata.dat", 'w+') as metadata: | |
| for word in model.wv.index2word: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_embedding(data, typeFunc=Word2Vec, size=100, window=5, min_count=5, sg=0, save=False):
    """
    Train an embedding model on ``data`` and return it.

    ``typeFunc`` is the model class to instantiate (the original notes it
    works with gensim's Word2Vec or FastText). ``size``, ``window``,
    ``min_count`` and ``sg`` are forwarded to that class unchanged; the
    number of workers is fixed at 4.

    When ``save`` is truthy, the trained model is also written to
    ``./<typeFunc.__name__>_embedding.mod`` in the current directory.

    NOTE(review): ``size`` is the gensim 3.x keyword name; gensim 4 renamed
    it to ``vector_size`` -- confirm the installed gensim version.
    """
    hyperparams = {
        "size": size,
        "window": window,
        "min_count": min_count,
        "workers": 4,
        "sg": sg,
    }
    model = typeFunc(data, **hyperparams)

    if save:
        # File name is derived from the model class, e.g. "FastText_embedding.mod".
        target = f"./{typeFunc.__name__}_embedding.mod"
        model.save(target)

    return model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def read_dataset(file_path, number=None, normalize=False, language="eng"): | |
| """ | |
| Read NUMBER_OF_DATASET lines of data in supplied file_path | |
| Perform normalization (if normalize=True) based on input language(default:"eng", option:"twi") | |
| Returns | |
| ------- | |
| List[list] of processed word tokens for sentences in file_path | |
| """ | |
| with open(file_path) as file: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import unicodedata | |
| import re | |
| import os | |
| from gensim.models import Word2Vec, FastText | |
# How many lines of the Twi text file to read.
NUMBER_OF_DATASET = 10_000
# Path to the Twi text file; change this to point at your local copy.
TWI_PATH = "../jw300.en-tw.tw"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "embeddings": [ | |
| { | |
| "tensorName": "Twi Embeddings", | |
| "tensorShape": [ | |
| 3890, | |
| 100 | |
| ], | |
| "tensorPath": "https://gist.githubusercontent.com/Lagyamfi/83ed29121d9c6a3c55cc43691ab1e6f1/raw/6c80f628fa2b50e883d9163acdb6acbd3f8576af/embedding_tensors.tsv", | |
| "metadataPath": "https://gist.githubusercontent.com/Lagyamfi/759b66dc72ed60977d7d68047644b1a9/raw/b09723bc18fe03ade7f2a9098677d111211737df/embedding_metadata.dat" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| a | |
| no | |
| . | |
| sɛ | |
| na | |
| mu | |
| ne | |
| ho | |
| wɔ | |
| so |
We can't make this file beautiful and searchable because it's too large.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| 0.00052419817 -0.013428101 -0.18923548 -0.12378584 -0.13348275 -0.19504371 -0.21705475 -0.06131517 -0.21444488 0.06652182 0.02302025 -0.062904164 -0.23141232 -0.042251814 -0.1532043 -0.062490422 0.0032143332 0.16235732 -0.03971875 -0.08317669 -0.039773207 0.1185327 -0.048253365 -0.041885294 -0.062435914 0.030222977 -0.069686696 0.08345175 -0.14653802 0.22368638 -0.17269213 -0.053428203 0.045868214 -0.1745172 0.14802118 -0.16412453 0.0831089 -0.08549252 -0.0844746 0.10793445 -0.02170284 0.25657293 0.010812476 0.018980177 -0.062152766 -0.07008561 0.13627873 0.0975509 -0.12622985 -0.184571 -0.28338507 0.009989383 0.017243551 -0.046570335 0.016103085 -0.03306862 0.066873915 -0.164175 -0.15156694 -0.112727 -0.17696792 -0.14249855 -0.043522142 -0.0874118 0.12529133 -0.19809446 -0.11694473 -0.110769145 0.22212538 -0.034395188 -0.039013393 -0.2554705 -0.18736535 -0.058384836 -0.16850983 0.04507998 0.11389926 -0.10845794 0.029420663 0.02597659 0.0058230497 0.06305347 0.07172061 -0.10133277 0.03858092 0.19234028 0.0204 |