Created
May 19, 2020 13:26
-
-
Save allanbatista/66fe0f2ab3f4c34909c5e3443f15f278 to your computer and use it in GitHub Desktop.
Word Pertubation Tensorflow.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Word Pertubation Tensorflow.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyOj2P/ZfSeY7xYxVIcLOUEd", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/allanbatista/66fe0f2ab3f4c34909c5e3443f15f278/word-pertubation-tensorflow.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tLLLrt96_yK4", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import tensorflow as tf\n", | |
"\n", | |
"\n", | |
def uniform_random_drop_sequence_fn(sequence_size, ration=0.1):
    """Build a function that drops random rows from a right-padded sequence.

    The returned function removes ``max(1, floor(n * ration))`` of the ``n``
    non-padding rows of a sequence and re-pads it on the right with zero rows
    so the output keeps exactly ``sequence_size`` rows.

    Note: this only works with right padding. A "padding row" is any row whose
    feature-sum is zero, so a genuine row summing to zero would be treated as
    padding — assumes features make non-zero row sums (e.g. non-negative
    embeddings); TODO confirm against callers.

    How it works: "o rato roeu o roupa do rei de roma"
              ->  "o rato roeu o roupa do de roma"

    Args:
        sequence_size: total number of rows (content + padding) per sequence.
        ration: fraction of non-padding rows to drop (sic: "ratio").

    Returns:
        A function mapping a ``(sequence_size, feature_dim)`` tensor to a
        tensor of the same shape and dtype with rows dropped and zero rows
        appended on the right.
    """
    ration = tf.constant(ration, dtype=tf.float32)
    min_quantity = tf.constant(1, dtype=tf.float32)

    def _uniform_random_drop_sequence(sequence):
        # Rows with a non-zero feature-sum are treated as real content.
        row_sums = tf.reduce_sum(sequence, axis=1)
        sequence_clean = tf.boolean_mask(sequence, tf.cast(row_sums, dtype=tf.bool))
        total_nonzero = tf.math.count_nonzero(row_sums)

        # At least one row is always dropped (min_quantity).
        drop_quantity = tf.math.floor(tf.multiply(tf.cast(total_nonzero, dtype=tf.float32), ration))
        drop_quantity = tf.reduce_max([drop_quantity, min_quantity])
        drop_quantity = tf.cast(drop_quantity, dtype=tf.int32)

        # Permutation trick: argsort of uniform noise yields a random
        # permutation, so this keep-mask drops `drop_quantity` distinct
        # uniformly-chosen positions.
        keep_mask = tf.argsort(tf.random.uniform([total_nonzero])) >= drop_quantity

        sequence_dropped = tf.boolean_mask(sequence_clean, keep_mask)

        # Re-pad on the right. Using the actual row count (tf.shape) instead
        # of re-counting non-zero row sums is exact even when a kept row sums
        # to zero, and matching the input dtype keeps tf.concat valid for
        # non-float32 sequences (tf.zeros defaults to float32 otherwise).
        right_pad_size = sequence_size - tf.shape(sequence_dropped)[0]
        right_pad = tf.zeros((right_pad_size, tf.shape(sequence)[1]), dtype=sequence.dtype)

        return tf.concat([sequence_dropped, right_pad], axis=0)

    return _uniform_random_drop_sequence
"\n", | |
"\n", | |
def uniform_random_drop_sequences(sequences, sequence_size, ration=0.1):
    """Apply the random-drop perturbation to every sequence in a batch.

    Args:
        sequences: tensor of shape ``(batch, sequence_size, feature_dim)``.
        sequence_size: number of rows (content + padding) per sequence.
        ration: fraction of non-padding rows to drop in each sequence
            (sic: "ratio").

    Returns:
        A tensor with the same shape and dtype as ``sequences``.
    """
    drop_fn = uniform_random_drop_sequence_fn(sequence_size=sequence_size, ration=ration)
    return tf.map_fn(drop_fn, sequences, dtype=sequences.dtype)
"\n", | |
"\n", | |
def uniform_random_swap_sequence(sequence_size, ration=0.1):
    """Build a function that randomly swaps adjacent rows of a sequence.

    The returned function picks ``max(1, floor(n * ration))`` random positions
    ``i`` among the ``n`` non-padding rows and exchanges row ``i`` with row
    ``i + 1``. Restricting candidates to the non-padding region fixes the
    original behavior of occasionally swapping a real word with a padding row
    at the content boundary.

    How it works: "o rato roeu o roupa do rei de roma"
              ->  "o [roeu rato] o roupa do rei [roma de]"

    Known limitation: if two chosen positions are adjacent (``i`` and
    ``i + 1``), their scatter updates overlap and one row may be duplicated
    while another is lost.

    Args:
        sequence_size: total number of rows (content + padding) per sequence.
        ration: fraction of rows to swap (sic: "ratio").

    Returns:
        A function mapping a ``(sequence_size, feature_dim)`` tensor to a
        tensor of the same shape with pairs of adjacent rows swapped.
    """
    ration = tf.constant(ration, dtype=tf.float32)
    min_quantity = tf.constant(1, dtype=tf.float32)

    def _uniform_random_swap_sequence(sequence):
        # Rows with a non-zero feature-sum are treated as real content.
        row_sums = tf.reduce_sum(sequence, axis=1)
        total_nonzero = tf.math.count_nonzero(row_sums)

        # At least one swap is always attempted (min_quantity).
        swap_quantity = tf.math.floor(tf.multiply(tf.cast(total_nonzero, dtype=tf.float32), ration))
        swap_quantity = tf.reduce_max([swap_quantity, min_quantity])
        swap_quantity = tf.cast(swap_quantity, dtype=tf.int32)

        # Distinct random positions drawn only from the non-padding region
        # (positions 0 .. total_nonzero - 2, so i + 1 is still content).
        # The maximum guards against a negative shape when the sequence is
        # all padding.
        num_candidates = tf.maximum(total_nonzero - 1, 0)
        positions_to_swap = tf.argsort(tf.random.uniform([num_candidates]))[:swap_quantity]

        # Build a permutation of row indices with each chosen position i
        # exchanged with i + 1, then gather the rows in that order.
        permutation = tf.range(sequence_size)
        permutation = tf.tensor_scatter_nd_update(permutation, tf.reshape(positions_to_swap, (-1, 1)), positions_to_swap + 1)
        permutation = tf.tensor_scatter_nd_update(permutation, tf.reshape(positions_to_swap + 1, (-1, 1)), positions_to_swap)

        return tf.gather(sequence, permutation)

    return _uniform_random_swap_sequence
"\n", | |
"\n", | |
def uniform_random_swap_sequences(sequences, sequence_size, ration=0.1):
    """Apply the random adjacent-swap perturbation to every sequence in a batch.

    Args:
        sequences: tensor of shape ``(batch, sequence_size, feature_dim)``.
        sequence_size: number of rows (content + padding) per sequence.
        ration: fraction of rows to swap in each sequence (sic: "ratio").

    Returns:
        A tensor with the same shape and dtype as ``sequences``.
    """
    swap_fn = uniform_random_swap_sequence(sequence_size=sequence_size, ration=ration)
    return tf.map_fn(swap_fn, sequences, dtype=sequences.dtype)
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_tDHJZWpCu32", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 458 | |
}, | |
"outputId": "f204cd3b-349a-45bd-b6df-7695e24307de" | |
}, | |
"source": [ | |
"sequences = tf.stack([tf.concat([tf.round(tf.random.uniform((i, 2)) * 10), tf.zeros((5-i, 2))], axis=0) for i in range(2, 6)])\n", | |
"sequences" | |
], | |
"execution_count": 74, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: shape=(4, 5, 2), dtype=float32, numpy=\n", | |
"array([[[ 0., 7.],\n", | |
" [ 8., 8.],\n", | |
" [ 0., 0.],\n", | |
" [ 0., 0.],\n", | |
" [ 0., 0.]],\n", | |
"\n", | |
" [[ 8., 4.],\n", | |
" [ 9., 8.],\n", | |
" [ 0., 1.],\n", | |
" [ 0., 0.],\n", | |
" [ 0., 0.]],\n", | |
"\n", | |
" [[ 6., 9.],\n", | |
" [ 0., 3.],\n", | |
" [ 1., 5.],\n", | |
" [ 1., 6.],\n", | |
" [ 0., 0.]],\n", | |
"\n", | |
" [[ 2., 5.],\n", | |
" [ 8., 1.],\n", | |
" [ 0., 1.],\n", | |
" [ 7., 10.],\n", | |
" [ 4., 2.]]], dtype=float32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 74 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "O-bBVD3vBV1f", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 458 | |
}, | |
"outputId": "8aee0a12-aa6a-43ed-9368-79f90ec3f29d" | |
}, | |
"source": [ | |
"uniform_random_drop_sequence = uniform_random_drop_sequences(sequences, sequence_size=sequences.get_shape()[1], ration=0.2)" | |
], | |
"execution_count": 62, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: shape=(4, 5, 2), dtype=float32, numpy=\n", | |
"array([[[0.37371004, 0.32919097],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.054371 , 0.04614568],\n", | |
" [0.65778756, 0.3740257 ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.34164703, 0.3842957 ],\n", | |
" [0.24133039, 0.05708456],\n", | |
" [0.8817282 , 0.74781 ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.4453119 , 0.26886618],\n", | |
" [0.38716066, 0.72081804],\n", | |
" [0.46648932, 0.9998658 ],\n", | |
" [0.65702033, 0.6006948 ],\n", | |
" [0. , 0. ]]], dtype=float32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 62 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0H4gFolpE9Ba", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 458 | |
}, | |
"outputId": "2d9e944a-955c-4a4f-9a2c-9c862614e67f" | |
}, | |
"source": [ | |
"uniform_random_swap_sequences(sequences, sequence_size=sequences.get_shape()[1], ration=0.2)" | |
], | |
"execution_count": 63, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<tf.Tensor: shape=(4, 5, 2), dtype=float32, numpy=\n", | |
"array([[[0.37371004, 0.32919097],\n", | |
" [0.85325074, 0.54138684],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.054371 , 0.04614568],\n", | |
" [0.65778756, 0.3740257 ],\n", | |
" [0.7399832 , 0.44001162],\n", | |
" [0. , 0. ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.144961 , 0.7626959 ],\n", | |
" [0.24133039, 0.05708456],\n", | |
" [0.34164703, 0.3842957 ],\n", | |
" [0.8817282 , 0.74781 ],\n", | |
" [0. , 0. ]],\n", | |
"\n", | |
" [[0.4453119 , 0.26886618],\n", | |
" [0.38716066, 0.72081804],\n", | |
" [0.46648932, 0.9998658 ],\n", | |
" [0.75947475, 0.3422799 ],\n", | |
" [0.65702033, 0.6006948 ]]], dtype=float32)>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 63 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ibfv86ubGK3i", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment