Alignment to English of fastText embeddings (using alignment matrices from Babylonpartners/fastText_multilingual)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Align vector spaces to English (Multilingual word vectors)!"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from fasttext import FastVector\n",
"\n",
"# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy\n",
"def normalized(a, axis=-1, order=2):\n",
" \"\"\"Utility function to normalize the rows of a numpy array.\"\"\"\n",
" l2 = np.atleast_1d(np.linalg.norm(a, order, axis))\n",
" l2[l2==0] = 1\n",
" return a / np.expand_dims(l2, axis)\n",
"\n",
"def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):\n",
" \"\"\"\n",
" Source and target dictionaries are the FastVector objects of\n",
" source/target languages. bilingual_dictionary is a list of \n",
" translation pair tuples [(source_word, target_word), ...].\n",
" \"\"\"\n",
" source_matrix = []\n",
" target_matrix = []\n",
"\n",
" for (source, target) in bilingual_dictionary:\n",
" if source in source_dictionary and target in target_dictionary:\n",
" source_matrix.append(source_dictionary[source])\n",
" target_matrix.append(target_dictionary[target])\n",
"\n",
" # return training matrices\n",
" return np.array(source_matrix), np.array(target_matrix)\n",
"\n",
"def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):\n",
" \"\"\"\n",
" Source and target matrices are numpy arrays, shape\n",
" (dictionary_length, embedding_dimension). These contain paired\n",
" word vectors from the bilingual dictionary.\n",
" \"\"\"\n",
" # optionally normalize the training vectors\n",
" if normalize_vectors:\n",
" source_matrix = normalized(source_matrix)\n",
" target_matrix = normalized(target_matrix)\n",
"\n",
" # perform the SVD\n",
" product = np.matmul(source_matrix.transpose(), target_matrix)\n",
" U, s, V = np.linalg.svd(product)\n",
"\n",
" # return orthogonal transformation which aligns source language to the target\n",
" return np.matmul(U, V)"
]
},
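{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch for the helpers above, using a tiny hypothetical Portuguese-English word list. The word pairs and the `pt_dictionary`/`en_dictionary` objects are illustrative, not data loaded by this notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: learn an alignment from a toy bilingual dictionary.\n",
"# pt_dictionary and en_dictionary are assumed FastVector objects.\n",
"bilingual_dictionary = [(\"gato\", \"cat\"), (\"cão\", \"dog\"), (\"casa\", \"house\")]\n",
"\n",
"source_matrix, target_matrix = make_training_matrices(\n",
"    pt_dictionary, en_dictionary, bilingual_dictionary)\n",
"\n",
"# W is orthogonal, so it rotates the source space without distorting\n",
"# distances; row vectors times W land in the English space\n",
"W = learn_transformation(source_matrix, target_matrix)\n",
"aligned_gato = np.matmul(pt_dictionary[\"gato\"], W)"
]
},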
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"class Downloder():\n",
" def download_manager(self, url, destination='', try_number=\"10\", time_out=\"60\"):\n",
" #threading.Thread(target=self._wget_dl, args=(url, destination, try_number, time_out, log_file)).start()\n",
" if self._wget_dl(url, destination, try_number, time_out, log_file) == 0:\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"\n",
" def _wget_dl(self,url, destination, try_number, time_out):\n",
" import subprocess\n",
" command=[\"wget\", \"-c\", \"-P\", destination, \"-t\", try_number, \"-T\", time_out , url]\n",
" try:\n",
" download_state=subprocess.call(command)\n",
" except Exception as e:\n",
" print(e)\n",
" #if download_state==0 => successfull download\n",
" return download_state"
]
},
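{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick usage sketch for the downloader (the destination folder here is illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative call: resumable download of the English vectors into a\n",
"# hypothetical local folder; True means wget exited with code 0.\n",
"downloader = Downloader()\n",
"ok = downloader.download_manager(\n",
"    \"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec\",\n",
"    destination=\"/tmp/embeddings/\")\n",
"print(\"download succeeded:\", ok)"
]
},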
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"16\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ar/wiki.ar.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/da/wiki.da.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/de/wiki.de.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/el/wiki.el.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/en/wiki.en.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/es/wiki.es.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fa/wiki.fa.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/hu/wiki.hu.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/fr/wiki.fr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/it/wiki.it.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ja/wiki.ja.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/lv/wiki.lv.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/pt/wiki.pt.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/ru/wiki.ru.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/tr/wiki.tr.vec\n",
"reading word vectors from /mnt/hdd_disk/dan/datasets/embeddings_fastText/uk/wiki.uk.vec\n",
"16\n"
]
}
],
"source": [
"languages=[\"ar\",\n",
"\"da\",\n",
"\"de\",\n",
"\"el\",\n",
"\"en\",\n",
"\"es\",\n",
"\"fa\",\n",
"\"hu\",\n",
"\"fr\",\n",
"\"it\",\n",
"\"ja\",\n",
"\"lv\",\n",
"\"pt\",\n",
"\"ru\",\n",
"\"tr\",\n",
"\"uk\"]\n",
"print(len(languages))\n",
"\n",
"languages_dictionary={}\n",
"\n",
"path_prefix = \"/mnt/hdd_disk/dan/datasets/embeddings_fastText/\"\n",
"for language in languages:\n",
" folder_path = path_prefix + language + \"/\"\n",
" file_path = path_prefix + language + \"/wiki.\"+language+\".vec\"\n",
" import os \n",
" if not os.path.isfile(file_path) :\n",
" if os.path.isdir(folder_path) :\n",
" os.mkdir(folder_path)\n",
" downloader = Downloder()\n",
" downloader._wget_dl(\"https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.\"+language+\".vec\", folder_path)\n",
" languages_dictionary[language]=FastVector(vector_file=file_path)\n",
"\n",
"print(len(languages_dictionary))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-0.014731081984914952\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
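{
"cell_type": "markdown",
"metadata": {},
"source": [
"`FastVector.cosine_similarity` compares two vectors by the cosine of the angle between them. A minimal numpy sketch of the same quantity, assuming the library follows the standard definition:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of cosine similarity: dot product over the product of norms,\n",
"# giving a value in [-1, 1] (1 = same direction).\n",
"def cosine_similarity_sketch(vec_a, vec_b):\n",
"    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))\n",
"\n",
"print(cosine_similarity_sketch(fa_vector, uk_vector))"
]
},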
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\"گربه\" and \"кіт\" both mean \"cat\", so they should be highly similar; clearly the two word vector spaces are not yet aligned. To align them, ..."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
" language_dictionary.apply_transform('alignment_matrices/'+language+'.txt')"
]
},
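{
"cell_type": "markdown",
"metadata": {},
"source": [
"Conceptually, `apply_transform` multiplies every embedding row by the language's alignment matrix. A minimal sketch of that operation on a single vector, assuming the alignment files hold a plain-text d x d matrix as in Babylonpartners/fastText_multilingual (illustration only; the loop above has already transformed the dictionaries in place):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: load one alignment matrix and apply it to a single raw vector,\n",
"# mirroring what apply_transform does to the whole embedding table.\n",
"transform = np.loadtxt('alignment_matrices/fa.txt')\n",
"raw_vec = np.random.randn(transform.shape[0])  # stand-in for an unaligned vector\n",
"aligned_vec = np.matmul(raw_vec, transform)"
]
},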
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.500099345287656\n"
]
}
],
"source": [
"fa_vector = languages_dictionary[\"fa\"][\"گربه\"]\n",
"uk_vector = languages_dictionary[\"uk\"][\"кіт\"]\n",
"\n",
"print(FastVector.cosine_similarity(fa_vector, uk_vector))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"for language, language_dictionary in languages_dictionary.items():\n",
" target_path = path_prefix + language + '/multiling_'+language+'.vec')\n",
" import os \n",
" if not os.path.isfile(target_path) :\n",
" language_dictionary.export(target_path)"
]
},
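{
"cell_type": "markdown",
"metadata": {},
"source": [
"The exported `.vec` files are plain fastText text format, so the aligned vectors can be reloaded later without repeating the alignment step. A sketch, reusing the paths from the loop above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: reload two exported, already-aligned vector files and confirm\n",
"# that the cross-lingual similarity survives the round trip.\n",
"aligned_fa = FastVector(vector_file=path_prefix + \"fa/multiling_fa.vec\")\n",
"aligned_uk = FastVector(vector_file=path_prefix + \"uk/multiling_uk.vec\")\n",
"print(FastVector.cosine_similarity(aligned_fa[\"گربه\"], aligned_uk[\"кіт\"]))"
]
}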
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}