Skip to content

Instantly share code, notes, and snippets.

@egenc
Last active February 9, 2023 15:41
Show Gist options
  • Save egenc/beb43c993ca3b53e23e6c27a2a22dfec to your computer and use it in GitHub Desktop.
Save egenc/beb43c993ca3b53e23e6c27a2a22dfec to your computer and use it in GitHub Desktop.
inscribe_task.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/egenc/beb43c993ca3b53e23e6c27a2a22dfec/inscribe_task.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SMKHXEGBHEfw"
},
"source": [
"# 1. Read Data and Assign Labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "iWC1kLOCHyXZ"
},
"outputs": [],
"source": [
"# One entry per line in each file: companies get label 0, individuals label 1.\n",
"INPUT_1 = \"/content/drive/MyDrive/dataset/companies.txt\"\n",
"INPUT_2 = \"/content/drive/MyDrive/dataset/individuals.txt\"\n",
"\n",
"\n",
"def read_lines(path):\n",
"    \"\"\"Return the text of `path` split on newline characters.\"\"\"\n",
"    with open(path, \"r\") as handle:\n",
"        return handle.read().split(\"\\n\")\n",
"\n",
"\n",
"data1 = read_lines(INPUT_1)\n",
"data2 = read_lines(INPUT_2)\n",
"\n",
"# Every row of a file shares the same label, so repeat the constant.\n",
"d1 = {'input_text': data1, 'label': [0] * len(data1)}\n",
"d2 = {'input_text': data2, 'label': [1] * len(data2)}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_ou6DLGmHbmg"
},
"source": [
"Checking Companies with label 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "fbIi78-XLPQ5",
"outputId": "38656dde-2c74-4035-c79d-9bf73a2154bb"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" input_text label\n",
"0 Albuquerque Dsm Club 0\n",
"1 Teriyaki Hawaii 0\n",
"2 Allure Electrolysis Waxing 0"
],
"text/html": [
"\n",
" <div id=\"df-d97d909d-c00c-4394-8987-a3189ae15e9e\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Albuquerque Dsm Club</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Teriyaki Hawaii</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Allure Electrolysis Waxing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d97d909d-c00c-4394-8987-a3189ae15e9e')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-d97d909d-c00c-4394-8987-a3189ae15e9e button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-d97d909d-c00c-4394-8987-a3189ae15e9e');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 2
}
],
"source": [
"import pandas as pd\n",
"\n",
"df1 = pd.DataFrame.from_dict(d1)\n",
"df1.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nW0sWMXIKH4x"
},
"source": [
"## 1.1 Data Cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_KdFFXsqHrlp"
},
"source": [
"Dropping duplicates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "o2n0IfyPOq2h",
"outputId": "9c6d140e-c15a-438a-a705-d4bd15844769"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"161056\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"161056"
]
},
"metadata": {},
"execution_count": 3
}
],
"source": [
"print(len(df1))\n",
"df1 = df1.drop_duplicates()\n",
"len(df1)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-4gxa3CZHuYB"
},
"source": [
"Checking Individuals with label 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "jFUFMFDfFQVh",
"outputId": "3076c61f-a0c2-4221-e85a-1abd2b03421f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" input_text label\n",
"0 David Kagan 1\n",
"1 Angel Donchev 1\n",
"2 Erika Parisi 1"
],
"text/html": [
"\n",
" <div id=\"df-badffb3b-39b5-4ecd-9b14-d5cbae3e4975\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>David Kagan</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Angel Donchev</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Erika Parisi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-badffb3b-39b5-4ecd-9b14-d5cbae3e4975')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-badffb3b-39b5-4ecd-9b14-d5cbae3e4975 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-badffb3b-39b5-4ecd-9b14-d5cbae3e4975');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"df2 = pd.DataFrame.from_dict(d2)\n",
"df2.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "10SCFgTNH1tM"
},
"source": [
"Dropping duplicates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sN8gR-ltPy9U",
"outputId": "2ab2f3b0-2cce-4678-faca-450b3c99b3c0"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"94247\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"90883"
]
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"print(len(df2))\n",
"df2 = df2.drop_duplicates()\n",
"len(df2)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hF27daSpH6HN"
},
"source": [
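"Concatenating the two dataframes and dropping every name that appears more than once (i.e. names listed under both labels), since their label is ambiguous"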
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FKqknPOZQAJU",
"outputId": "f6e038ba-1d1f-4ddd-da9e-7f298ef8f921"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"251939\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"251665"
]
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it\n",
"# the row labels of df1 and df2 overlap, and label-based operations such as\n",
"# df.drop(<index labels>) would act on rows from both frames.\n",
"df = pd.concat([df1, df2], ignore_index=True)\n",
"\n",
"print(len(df))\n",
"# keep=False removes *every* row whose text occurs more than once, so a name\n",
"# listed under both labels is dropped entirely instead of surviving with\n",
"# whichever label happened to come first.\n",
"df = df.drop_duplicates(subset=\"input_text\", keep=False)\n",
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZgZmHTqTQUp8",
"outputId": "ad0e6445-4c99-49d3-adc9-e3da23b713b9"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" input_text label\n",
"0 Albuquerque Dsm Club 0\n",
"1 Teriyaki Hawaii 0\n",
"2 Allure Electrolysis Waxing 0\n",
"---------------\n",
" input_text label\n",
"94243 Mark Dudas 1\n",
"94244 Parag Dixit 1\n",
"94245 Andrew Battista 1\n"
]
}
],
"source": [
"print(df.head(3))\n",
"print(\"-\"*15)\n",
"print(df.tail(3))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "R2E8KqthSShp"
},
"source": [
"Checking NaN values in the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "O-oqh0j8QqLz",
"outputId": "e5b08cc4-672e-4180-f951-742d6ae6cfaa"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"execution_count": 8
}
],
"source": [
"df.isna().sum().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jE-N31IGTRAy"
},
"source": [
"Checking for empty strings: they carry no information, only add noise to the data, and may hurt model performance.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49
},
"id": "hnveH8N3RXou",
"outputId": "64f1b44a-c1c0-48bd-95d6-37ae871a74cd"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Empty DataFrame\n",
"Columns: [input_text, label, string_content]\n",
"Index: []"
],
"text/html": [
"\n",
" <div id=\"df-ea237b19-6c0b-4e48-97d0-0d757af7eaf4\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>label</th>\n",
" <th>string_content</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ea237b19-6c0b-4e48-97d0-0d757af7eaf4')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-ea237b19-6c0b-4e48-97d0-0d757af7eaf4 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-ea237b19-6c0b-4e48-97d0-0d757af7eaf4');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"def check_empty_strings(s) -> bool:\n",
"    \"\"\"Return True when `s` contains at least one non-whitespace character.\n",
"\n",
"    Despite the name, a True result means the string is *not* empty.\n",
"    All surrounding whitespace (spaces, tabs, carriage returns, ...) is\n",
"    ignored, so a value such as a lone carriage return counts as empty.\n",
"    \"\"\"\n",
"    return bool(s.strip())\n",
"\n",
"\n",
"# Series.map walks a single column; DataFrame.apply(axis=1) would build a\n",
"# Series object for every row, which is much slower on a large frame.\n",
"df[\"string_content\"] = df[\"input_text\"].map(check_empty_strings)\n",
"# Show the rows flagged as having no content.\n",
"df[~df[\"string_content\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "4_CGSvD9TC95",
"outputId": "97348251-4438-459c-d388-daf0154c9324"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" input_text label string_content\n",
"0 Albuquerque Dsm Club 0 True\n",
"1 Teriyaki Hawaii 0 True\n",
"2 Allure Electrolysis Waxing 0 True\n",
"3 Los Primos Auto Sales 0 True\n",
"4 John Wesselius 0 True\n",
"... ... ... ...\n",
"94229 Paul Coulombe 1 True\n",
"94230 Vinod Chavan 1 True\n",
"94231 Nishank Chandawala 1 True\n",
"94232 Zachary Butler 1 True\n",
"94233 Dale Briggs 1 True\n",
"\n",
"[251655 rows x 3 columns]"
],
"text/html": [
"\n",
" <div id=\"df-8ca21b9a-e322-40ad-a90e-1cea1e6a7413\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>label</th>\n",
" <th>string_content</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Albuquerque Dsm Club</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Teriyaki Hawaii</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Allure Electrolysis Waxing</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Los Primos Auto Sales</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>John Wesselius</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94229</th>\n",
" <td>Paul Coulombe</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94230</th>\n",
" <td>Vinod Chavan</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94231</th>\n",
" <td>Nishank Chandawala</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94232</th>\n",
" <td>Zachary Butler</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94233</th>\n",
" <td>Dale Briggs</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>251655 rows × 3 columns</p>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ca21b9a-e322-40ad-a90e-1cea1e6a7413')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-8ca21b9a-e322-40ad-a90e-1cea1e6a7413 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-8ca21b9a-e322-40ad-a90e-1cea1e6a7413');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 10
}
],
"source": [
"# Keep only the rows flagged as having content. A boolean mask selects rows by\n",
"# position, so it stays correct even when index labels repeat (as they can\n",
"# after concatenating frames); dropping by label would remove every row that\n",
"# shares the label.\n",
"df = df[df[\"string_content\"]].copy()\n",
"\n",
"df.head(-10)"
]
},
{
"cell_type": "markdown",
"source": [
"The longest string in the dataset is 32 characters, so there is no need to truncate the inputs."
],
"metadata": {
"id": "c4HbADeewucY"
}
},
{
"cell_type": "code",
"source": [
"int(df[\"input_text\"].str.len().max())"
],
"metadata": {
"id": "bx_ZGU9Mwdmx",
"outputId": "bfa77b83-cbcf-415b-8fe2-f9dbcb598517",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"32"
]
},
"metadata": {},
"execution_count": 68
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hOppDSBLIhnu"
},
"source": [
"Lowercasing the input texts and removing punctuation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "Yi5iVFO3NBsF",
"outputId": "334475c1-67c4-4580-80a8-29af1158c671"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" input_text label\n",
"0 albuquerque dsm club 0\n",
"1 teriyaki hawaii 0\n",
"2 allure electrolysis waxing 0\n",
"3 los primos auto sales 0\n",
"4 john wesselius 0\n",
"... ... ...\n",
"94229 paul coulombe 1\n",
"94230 vinod chavan 1\n",
"94231 nishank chandawala 1\n",
"94232 zachary butler 1\n",
"94233 dale briggs 1\n",
"\n",
"[251655 rows x 2 columns]"
],
"text/html": [
"\n",
" <div id=\"df-1ab2fc51-b752-4874-b146-0a4819f31068\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input_text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>albuquerque dsm club</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>teriyaki hawaii</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>allure electrolysis waxing</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>los primos auto sales</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>john wesselius</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94229</th>\n",
" <td>paul coulombe</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94230</th>\n",
" <td>vinod chavan</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94231</th>\n",
" <td>nishank chandawala</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94232</th>\n",
" <td>zachary butler</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94233</th>\n",
" <td>dale briggs</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>251655 rows × 2 columns</p>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1ab2fc51-b752-4874-b146-0a4819f31068')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-1ab2fc51-b752-4874-b146-0a4819f31068 button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-1ab2fc51-b752-4874-b146-0a4819f31068');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"# Series.str.replace(..., regex=True) substitutes inside each string. Plain\n",
"# Series.replace without regex=True only matches cells whose whole value\n",
"# equals the pattern, so it would leave the punctuation in place.\n",
"df[\"input_text\"] = (\n",
"    df[\"input_text\"]\n",
"    .str.lower()\n",
"    .str.replace(r\"[^\\w\\s]\", \"\", regex=True)\n",
")\n",
"# errors=\"ignore\" keeps this cell re-runnable once the helper column is gone.\n",
"df = df.drop(columns=[\"string_content\"], errors=\"ignore\")\n",
"\n",
"df.head(-10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TqieoeiMKQGi"
},
"source": [
"# 2. Modelling"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KQjtZ36YIl0m"
},
"source": [
"Models need numerical features rather than raw text, so we turn each input into a vector of token counts using **CountVectorizer**. More info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "D537uMC-OeTm"
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"vectorizer = CountVectorizer()\n",
"X = vectorizer.fit_transform(df[\"input_text\"])\n",
"y = df[\"label\"]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Eu7q-BySI2r5"
},
"source": [
"Splitting train and test data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qnrhS68QTNAD"
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hwHskT6iKTL1"
},
"source": [
"## 2.1.1 LogisticRegression\n",
"\n",
"For binary classification (such as this task), starting with logistic regression is a good choice since it might give us a baseline. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pCtUc9jVPN1d",
"outputId": "6eff41c0-e677-4756-fba4-58fcd8b23891"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[27698, 4489],\n",
" [ 2085, 16061]])"
]
},
"metadata": {},
"execution_count": 14
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn import metrics\n",
"\n",
"# With the default max_iter=100 the lbfgs solver stops early on this data and\n",
"# emits a ConvergenceWarning; give it more iterations to converge.\n",
"logreg = LogisticRegression(random_state=16, max_iter=1000)\n",
"\n",
"logreg.fit(X_train, y_train)\n",
"\n",
"y_pred = logreg.predict(X_test)\n",
"\n",
"# Rows are the actual classes, columns the predicted classes.\n",
"cnf_matrix = metrics.confusion_matrix(y_test, y_pred)\n",
"cnf_matrix"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BDxTHgpEKjZp"
},
"source": [
"## 2.1.2 Visualizing the results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 343
},
"id": "mWkoY-M-QPd6",
"outputId": "14b15f0f-012b-449a-a85c-00c8c612b0d0"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 257.44, 'Predicted label')"
]
},
"metadata": {},
"execution_count": 15
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
],
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"class_names = [0, 1]  # referring to COMPANY & CUSTOMER\n",
"tick_marks = np.arange(len(class_names))\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.set_xticks(tick_marks)\n",
"ax.set_xticklabels(class_names)\n",
"ax.set_yticks(tick_marks)\n",
"ax.set_yticklabels(class_names)\n",
"\n",
"# Draw the confusion matrix on the axes created above, annotated with the raw counts.\n",
"sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap=\"YlGnBu\", fmt='g', ax=ax)\n",
"ax.xaxis.set_label_position(\"top\")\n",
"fig.tight_layout()\n",
"ax.set_title('Confusion matrix', y=1.1)\n",
"ax.set_ylabel('Actual label')\n",
"ax.set_xlabel('Predicted label')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qpVickbeE2CO"
},
"source": [
"The weighted-averaged F1 score is calculated by taking the mean of all per-class F1 scores while considering each class's support. Support refers to the number of actual occurrences of the class in the dataset.\n",
"\n",
"Therefore, we will take into account the **weighted F1 score**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4prS4Tq1D2zD",
"outputId": "f1f3acfd-3a86-4815-d6cd-9540c65a1d9b"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8709135822543531"
]
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"from sklearn.metrics import f1_score\n",
"\n",
"# Support-weighted mean of the per-class F1 scores for the current predictions.\n",
"logreg_weighted_f1 = f1_score(y_test, y_pred, average='weighted')\n",
"logreg_weighted_f1"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ZQYUP9--JIHo"
},
"source": [
"Hash map for labels and their names"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TPJQaMNLGoyv"
},
"outputs": [],
"source": [
"# Maps each integer class label to its human-readable name.\n",
"res_dict = {\n",
"    0: \"COMPANY\",\n",
"    1: \"CUSTOMER\",\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9_cLos1RKwzq"
},
"source": [
"## 2.1.3 Printing wrong results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0swhleR_FN8j",
"outputId": "a72d1e9d-b16d-408f-8bbb-411902f7916e"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[array(['leonard', 'cassert'], dtype='<U31')] has been classified as COMPANY and should be CUSTOMER\n",
"[array(['shelby', 'byler'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n",
"[array(['brien', 'keane'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n",
"[array(['melinda', 'sacks'], dtype='<U31')] has been classified as COMPANY and should be CUSTOMER\n",
"[array(['lance', 'leland', 'provencher'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n"
]
}
],
"source": [
"# Print the first few misclassified test samples with their predicted and true class names.\n",
"MAX_SHOWN = 5\n",
"count = 0\n",
"# The loop variable is `sample`, not `input`, so the built-in input() is not overwritten\n",
"# in the notebook namespace.\n",
"for sample, prediction, label in zip(X_test, y_pred, y_test):\n",
"    if count == MAX_SHOWN:\n",
"        break\n",
"    if prediction != label:\n",
"        count += 1\n",
"        print(vectorizer.inverse_transform(sample), 'has been classified as ', res_dict[prediction], 'and should be ', res_dict[label])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UHpPEhjRJ9Y7"
},
"source": [
"Getting output feature names for transformation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "us9hzZG-bc_W",
"outputId": "38d4aa50-992d-4ab7-bf6b-9d3e5d5e3bbe"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['00', '000', '00142', ..., 'zywicki', 'zywien', 'zzzs'],\n",
" dtype=object)"
]
},
"metadata": {},
"execution_count": 19
}
],
"source": [
"vectorizer.get_feature_names_out()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "mZPSW-t1K10A"
},
"source": [
"## 2.1.4 Inference on User Inputs"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7-0iWS87K6l0"
},
"source": [
"Inputs need to be normalized in the same way as the train and test data:\n",
"- Removing punctuation\n",
"- Lowercasing the text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
},
"id": "ltMSk-7pCsYo",
"outputId": "f9a05d7b-c72e-4356-f03e-8d51564fe27b"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 20
}
],
"source": [
"import string\n",
"\n",
"string.punctuation"
]
},
{
"cell_type": "markdown",
"source": [
"The following code preprocesses the custom inputs coming from users.\n",
"It:\n",
"- removes punctuation (the characters listed above)\n",
"- lowercases the text\n",
"- raises an error if any string is longer than 32 characters."
],
"metadata": {
"id": "kw-XstrPwER0"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JgcDTA9ZcVrE",
"outputId": "fcb29eec-777c-4cd0-c762-f57d5581574c"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Input text: inscribeai \n",
" Prediction COMPANY\n",
"----------\n",
"Input text: string that \n",
" Prediction COMPANY\n",
"----------\n",
"Input text: erdal genc \n",
" Prediction CUSTOMER\n",
"----------\n",
"Input text: oisín moran \n",
" Prediction CUSTOMER\n",
"----------\n",
"Input text: jp morgan \n",
" Prediction COMPANY\n",
"----------\n",
"Input text: paramount sp \n",
" Prediction COMPANY\n",
"----------\n"
]
}
],
"source": [
"def preprocess_input(List, max_len=32) -> list:\n",
"    \"\"\"Normalize raw user strings before they are vectorized.\n",
"\n",
"    Each string is lowercased and stripped of every character in\n",
"    ``string.punctuation``.\n",
"\n",
"    Args:\n",
"        List: iterable of input strings.\n",
"        max_len: maximum allowed length of a cleaned string (default 32).\n",
"\n",
"    Returns:\n",
"        A new list with the cleaned strings, in input order.\n",
"\n",
"    Raises:\n",
"        ValueError: if any cleaned string is longer than ``max_len``.\n",
"    \"\"\"\n",
"    # Build the translation table once instead of once per string.\n",
"    strip_punctuation = str.maketrans('', '', string.punctuation)\n",
"    cleaned = [s.lower().translate(strip_punctuation) for s in List]\n",
"    if any(len(s) > max_len for s in cleaned):\n",
"        # ValueError is still an Exception, so existing `except Exception` handlers keep working.\n",
"        raise ValueError(f\"One or more strings are longer than {max_len} chars\")\n",
"    return cleaned\n",
"\n",
"def predictor(name_l) -> None:\n",
"    \"\"\"Print the predicted class name for each input string.\n",
"\n",
"    The inputs are cleaned with ``preprocess_input``, vectorized with the fitted\n",
"    ``vectorizer`` and classified with ``logreg``; labels are mapped to names\n",
"    through ``res_dict``.\n",
"\n",
"    Args:\n",
"        name_l: list of raw input strings.\n",
"    \"\"\"\n",
"    name_l = preprocess_input(name_l)\n",
"    if not name_l:\n",
"        # Nothing to classify; do not hand an empty batch to the model.\n",
"        return\n",
"\n",
"    preds = logreg.predict(vectorizer.transform(name_l))\n",
"    for text, pred in zip(name_l, preds):\n",
"        print(\"Input text:\", text, \"\\n\", \"Prediction\", res_dict[pred])\n",
"        print(\"-\"*10)\n",
"\n",
"# A few hand-written examples to sanity-check the model.\n",
"s1 = \"inscribeAI ~\"\n",
"s2 = \"string that\"\n",
"s3 = \"Erdal Genc\"\n",
"s4 = \"Oisín Moran\"\n",
"s5 = \"jp morgan\"\n",
"s6 = \"paramount S&P\"\n",
"\n",
"predictor([s1, s2, s3, s4, s5, s6])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d6yvryDNLMzl"
},
"source": [
"## 2.2.1 Random Forest\n",
"\n",
"Random Forest is a tree-based algorithm which may give better predictions (in my experience, tree-based models are quite effective on tabular data). Therefore, the second approach is Random Forest. For more details:\n",
"https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ee2yaBhBDpDJ",
"outputId": "57249a36-d0ab-44af-eb3d-514f66ef7200"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[28011, 4176],\n",
" [ 4262, 13884]])"
]
},
"metadata": {},
"execution_count": 22
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# A fixed random_state makes the fitted forest, and therefore the reported results, reproducible.\n",
"RFCclf = RandomForestClassifier(\n",
"    n_estimators=50,\n",
"    bootstrap=False,\n",
"    random_state=48,\n",
").fit(X_train, y_train)\n",
"\n",
"# Overwrites the logistic-regression predictions with the random-forest ones.\n",
"y_pred = RFCclf.predict(X_test)\n",
"cnf_matrix = metrics.confusion_matrix(y_test, y_pred)\n",
"cnf_matrix"
]
},
{
"cell_type": "code",
"source": [
"class_names = [0, 1]  # referring to COMPANY & CUSTOMER\n",
"tick_marks = np.arange(len(class_names))\n",
"\n",
"fig, ax = plt.subplots()\n",
"ax.set_xticks(tick_marks)\n",
"ax.set_xticklabels(class_names)\n",
"ax.set_yticks(tick_marks)\n",
"ax.set_yticklabels(class_names)\n",
"\n",
"# Draw the confusion matrix on the axes created above, annotated with the raw counts.\n",
"sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap=\"YlGnBu\", fmt='g', ax=ax)\n",
"ax.xaxis.set_label_position(\"top\")\n",
"fig.tight_layout()\n",
"ax.set_title('Confusion matrix', y=1.1)\n",
"ax.set_ylabel('Actual label')\n",
"ax.set_xlabel('Predicted label')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 343
},
"id": "CeluwNGWoj1X",
"outputId": "95827939-d61f-4aae-9666-b3b89c342768"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Text(0.5, 257.44, 'Predicted label')"
]
},
"metadata": {},
"execution_count": 23
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
],
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"cell_type": "code",
"source": [
"# Support-weighted mean of the per-class F1 scores for the current predictions.\n",
"rf_weighted_f1 = f1_score(y_test, y_pred, average='weighted')\n",
"rf_weighted_f1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uf6LVw9PooyI",
"outputId": "5dc401c6-79cc-4c97-8b14-7e859c7ec5cb"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.83226923581755"
]
},
"metadata": {},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"source": [
"# Print the first few misclassified test samples with their predicted and true class names.\n",
"MAX_SHOWN = 5\n",
"count = 0\n",
"# The loop variable is `sample`, not `input`, so the built-in input() is not overwritten\n",
"# in the notebook namespace.\n",
"for sample, prediction, label in zip(X_test, y_pred, y_test):\n",
"    if count == MAX_SHOWN:\n",
"        break\n",
"    if prediction != label:\n",
"        count += 1\n",
"        print(vectorizer.inverse_transform(sample), 'has been classified as ', res_dict[prediction], 'and should be ', res_dict[label])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9QgM-HLFpvbH",
"outputId": "a72f9dc9-e72f-4e2f-8eae-7ab4b2b244b3"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[array(['cassert', 'leonard'], dtype='<U31')] has been classified as COMPANY and should be CUSTOMER\n",
"[array(['byler', 'shelby'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n",
"[array(['hall', 'joy'], dtype='<U31')] has been classified as COMPANY and should be CUSTOMER\n",
"[array(['gunter', 'unruh'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n",
"[array(['brien', 'keane'], dtype='<U31')] has been classified as CUSTOMER and should be COMPANY\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## 2.3.1 SBert\n",
"\n",
"SentenceTransformers is a Python framework for state-of-the-art sentence and text embeddings."
],
"metadata": {
"id": "zIP1Ca5ys6m7"
}
},
{
"cell_type": "code",
"source": [
"# %pip installs into the environment of the running kernel, whereas a bare `!pip` runs whichever\n",
"# pip is first on the shell PATH.\n",
"%pip install -q torch transformers evaluate memory_profiler datasets"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dFSl_yRyRY7x",
"outputId": "e5953d67-baa2-4a81-8f9c-aa3bdd51682f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/81.4 KB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
}
]
},
{
"cell_type": "code",
"source": [
"# From here on the features are the raw strings rather than the vectorized matrix.\n",
"X, y = df[\"input_text\"], df[\"label\"]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=48\n",
")"
],
"metadata": {
"id": "jI9FXDJQ1lzQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## 2.3.2 Preparing Dataset for HuggingFace Dataset Class for Compatibility"
],
"metadata": {
"id": "QmklXAGrTq8y"
}
},
{
"cell_type": "code",
"source": [
"import datasets\n",
"import pandas as pd\n",
"\n",
"\n",
"# Wrap each split in a two-column frame; \"text\" is the column read by the tokenization step.\n",
"train_df = pd.DataFrame({\"text\": X_train, \"label\": y_train})\n",
"test_df = pd.DataFrame({\"text\": X_test, \"label\": y_test})\n",
"\n",
"# Only the `datasets` module itself is imported in this cell, so `Dataset` is reached through it;\n",
"# the bare name `Dataset` raised a NameError on a fresh kernel.\n",
"# preserve_index=False keeps the shuffled pandas index from becoming an extra dataset column.\n",
"train_dataset = datasets.Dataset.from_pandas(train_df, preserve_index=False)\n",
"test_dataset = datasets.Dataset.from_pandas(test_df, preserve_index=False)\n",
"my_dataset_dict = datasets.DatasetDict({\"train\": train_dataset, \"test\": test_dataset})"
],
"metadata": {
"id": "F1VxkExFJNeR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"my_dataset_dict"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f43vP9lML3DO",
"outputId": "040cec9c-b1b3-4f59-8e9c-24a39aac6e03"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['text', 'label'],\n",
" num_rows: 201332\n",
" })\n",
" test: Dataset({\n",
" features: ['text', 'label'],\n",
" num_rows: 50333\n",
" })\n",
"})"
]
},
"metadata": {},
"execution_count": 58
}
]
},
{
"cell_type": "markdown",
"source": [
"## 2.3.3 Using Sentence Embeddings\n",
"\n",
"Each input consists of multiple tokens, so using sentence embeddings instead of token counts is a better approach. Also, BERT (a transformer model) keeps the positional information of the tokens.\n",
"Source: https://towardsdatascience.com/sentence-embedding-3053db22ea77"
],
"metadata": {
"id": "jm9RdHtRUPQj"
}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import random\n",
"from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
"from datasets import load_metric\n",
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd\n",
"import numpy as np\n",
"%load_ext memory_profiler\n",
"\n",
"# Upper bound on the tokenized sequence length.\n",
"# NOTE(review): 512 is assumed to match the position limit of the model fine-tuned below — confirm.\n",
"MAX_SEQ_LENGTH = 512\n",
"\n",
"# Load the tokenizer (lowercases its input).\n",
"tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens', do_lower_case=True)\n",
"\n",
"def preprocess_function(examples):\n",
"    \"\"\"Tokenize the text column of a batch of examples.\n",
"\n",
"    Args:\n",
"        examples: batch mapping that contains a \"text\" entry.\n",
"\n",
"    Returns:\n",
"        The tokenizer output for the batch.\n",
"    \"\"\"\n",
"    # Without an explicit max_length this tokenizer logs that no maximum length is defined and\n",
"    # falls back to no truncation, so truncation=True alone has no effect.\n",
"    return tokenizer(examples[\"text\"], truncation=True, max_length=MAX_SEQ_LENGTH)\n",
"\n",
"tokenized_dataset = my_dataset_dict.map(preprocess_function, batched=True)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 155,
"referenced_widgets": [
"9c4fb2d129444963aea420c854eef3fc",
"3b5cacca64fd493585821976258161ea",
"23189cd56a1241a0a380908881212b80",
"5c195943784a4148879a2eba852de73e",
"96c47c1a7a56495b8769b2abc91c7554",
"c267747bc9eb4f479725973e6f5a45e3",
"506588deea78460ea457fcfe500e6573",
"840a8c9797104868a9de57f861911ebd",
"1975d008d30a4ca3886ee5df896b7761",
"80925e9024c0420094607c2c6acb6bdc",
"b080984034b946ed82c2a980142bf692",
"1f6f87f4e0ef4832860c44a70eb1e4b0",
"c5a6012a4db54497aef8b5b504299e18",
"e2433a191b9d439e97194e46c8ef911b",
"5d6825da704e4a669ab5e65025aab34a",
"f1e91c961e804fc4952c0e01821bcfc6",
"5a7feaa348e948c9a49128440581575e",
"516791f2b3c44db289647cf13406ab3a",
"fd29542280ee4e93b98cc6435bac95cc",
"a6318f96b86e4f78b7b83dbdb06650e3",
"3b94b9df7c9d4e2e84bafab06d4c996e",
"9b41efcc8b6e4fb598528a9471c2ce09"
]
},
"id": "HnGRUESts420",
"outputId": "6f46ca75-181e-41a6-e18e-4deecb897023"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The memory_profiler extension is already loaded. To reload it, use:\n",
" %reload_ext memory_profiler\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/202 [00:00<?, ?ba/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "9c4fb2d129444963aea420c854eef3fc"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" 0%| | 0/51 [00:00<?, ?ba/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "1f6f87f4e0ef4832860c44a70eb1e4b0"
}
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"from transformers import DataCollatorWithPadding\n",
"\n",
"# Collator that pads the examples of each batch using the tokenizer.\n",
"data_collator = DataCollatorWithPadding(tokenizer)"
],
"metadata": {
"id": "1s54-PQcMU_R"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Using F1 score"
],
"metadata": {
"id": "-1OVZ3cJVEMt"
}
},
{
"cell_type": "code",
"source": [
"import evaluate\n",
"\n",
"# Load the F1 metric from the `evaluate` library.\n",
"f1_metric = evaluate.load(path=\"f1\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"2927debb8f1a4f8896e8e3e64672aa3b",
"ee01f994e4884ee289af2076c2418f9c",
"25c4ae34365547afbf0757d7a1983eb9",
"3272acbdf75f4dc5a54058638cd5b3ca",
"6884a271eac1481ca360bc792d61ae48",
"000629234cfe477b971b14ca9cec62bb",
"88cc1d67e50d4539b50334f8661e1910",
"962f9d867fd844c8b4810bf394821f96",
"aeb7ffe019804fb28c64cd406dc1f118",
"c3dbae8b12c64860a2fb9332dd105144",
"661a6a170eb64681816b2ba3d0d664b2"
]
},
"id": "Fh8PG7woONNi",
"outputId": "e01f4d5f-3de2-4669-e461-25bca888600c"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Downloading builder script: 0%| | 0.00/6.77k [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "2927debb8f1a4f8896e8e3e64672aa3b"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 2.3.4 Pytorch as Backend\n",
"\n",
"We are returning tensors with PyTorch since it is slightly easier to read. HuggingFace also supports TensorFlow for returning tensors."
],
"metadata": {
"id": "to29EmFoVReJ"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
"\n",
"def compute_metrics(eval_pred):\n",
"    \"\"\"Compute the weighted F1 score for one evaluation pass.\n",
"\n",
"    Args:\n",
"        eval_pred: pair of (model outputs, reference labels); the predicted class\n",
"            is the argmax of the outputs along axis 1.\n",
"\n",
"    Returns:\n",
"        The result of ``f1_metric.compute`` for the predictions.\n",
"    \"\"\"\n",
"    logits, labels = eval_pred\n",
"    predictions = np.argmax(logits, axis=1)\n",
"    # The original referenced an undefined `accuracy` object; the metric loaded in this notebook\n",
"    # is `f1_metric`. average=\"weighted\" matches the weighted F1 reported for the other models.\n",
"    return f1_metric.compute(predictions=predictions, references=labels, average=\"weighted\")\n",
"\n",
"# Class-id <-> class-name mappings stored in the model config.\n",
"id2label = {0: \"COMPANY\", 1: \"CUSTOMER\"}\n",
"label2id = {\"COMPANY\": 0, \"CUSTOMER\": 1}\n",
"\n",
"model = AutoModelForSequenceClassification.from_pretrained(\n",
"    \"distilbert-base-uncased\", num_labels=2, id2label=id2label, label2id=label2id\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 190,
"referenced_widgets": [
"95b0db5c543f43eebf63635730b7037d",
"41118f6465ce48c39dec15b4d1103952",
"43e310d1238a4b409748ec997e8b0862",
"99cbf6b355344cbdb355aef2885f9031",
"402feee4662a433bbcf178a79302a770",
"4f7125ae83804de0a29f4bc79ed64684",
"b96c6005b2d145639ee11f7d884fa60f",
"cb5d668913694d928fa866c141574f45",
"5e4c10ef1d7c489e920cee620fbd3f98",
"6e65358a194b4688810ff2811401500d",
"27dae2fa50ef45678e14b873371e80ae",
"d068bd0702fe4088b8f39a5c950d6e37",
"b526822b801249c4ac5f4c39a8cbb926",
"a89e11a023c7463794f82cf2ac63df37",
"eb1edbcadfd649a0ba0532c85079173a",
"1a3c5460e6de4d8c897e51dea64851da",
"544e750a299545f08b9c5938228d9c12",
"e1f7c2e195324494bee2e45e1930e4f7",
"97a00339991e4f27bae09592918eccd4",
"0cd8237e1fb74ea4ad2c7c5973c9081b",
"b24abd4b909f44eda2347a43b2e8c719",
"6f12fee2b06d41c3855fd50df4c53841"
]
},
"id": "IwUC7vUmOQUi",
"outputId": "6f25bc85-222f-47a2-a76e-1843dbacba84"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/483 [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "95b0db5c543f43eebf63635730b7037d"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Downloading (…)\"pytorch_model.bin\";: 0%| | 0.00/268M [00:00<?, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d068bd0702fe4088b8f39a5c950d6e37"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']\n",
"- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Let's train it!"
],
"metadata": {
"id": "wdpJIpAjfwXZ"
}
},
{
"cell_type": "code",
"source": [
"# Evaluate and checkpoint once per epoch so the best checkpoint can be reloaded when training ends.\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"inScribe_model\",\n",
"    num_train_epochs=2,\n",
"    learning_rate=2e-5,\n",
"    weight_decay=0.01,\n",
"    per_device_train_batch_size=16,\n",
"    per_device_eval_batch_size=16,\n",
"    evaluation_strategy=\"epoch\",\n",
"    save_strategy=\"epoch\",\n",
"    load_best_model_at_end=True,\n",
")\n",
"\n",
"train_split = tokenized_dataset[\"train\"]\n",
"eval_split = tokenized_dataset[\"test\"]\n",
"\n",
"trainer = Trainer(\n",
"    model=model,\n",
"    args=training_args,\n",
"    data_collator=data_collator,\n",
"    tokenizer=tokenizer,\n",
"    train_dataset=train_split,\n",
"    eval_dataset=eval_split,\n",
"    compute_metrics=compute_metrics,\n",
")\n",
"\n",
"trainer.train()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 740
},
"id": "2kf3KfiRPFZG",
"outputId": "0d7f0e41-62e4-4415-91da-e5e2cd1efc58"
},
"execution_count": null,
"outputs": [
{
"metadata": {
"tags": null
},
"name": "stderr",
"output_type": "stream",
"text": [
"PyTorch: setting up devices\n",
"The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
"The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `DistilBertForSequenceClassification.forward`, you can safely ignore this message.\n",
"/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
" Num examples = 201332\n",
" Num Epochs = 2\n",
" Instantaneous batch size per device = 16\n",
" Total train batch size (w. parallel, distributed & accumulation) = 16\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 25168\n",
" Number of trainable parameters = 66955010\n",
"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='5825' max='25168' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [ 5825/25168 2:07:23 < 7:03:11, 0.76 it/s, Epoch 0.46/2]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='6125' max='25168' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [ 6125/25168 2:13:54 < 6:56:28, 0.76 it/s, Epoch 0.49/2]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Epoch</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table><p>"
]
},
"metadata": {}
},
{
"output_type": "error",
"ename": "KeyboardInterrupt",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-66-b9b00cf08581>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 21\u001b[0m )\n\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1541\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_inner_training_loop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_batch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_find_batch_size\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1542\u001b[0m )\n\u001b[0;32m-> 1543\u001b[0;31m return inner_training_loop(\n\u001b[0m\u001b[1;32m 1544\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1545\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1856\u001b[0m \u001b[0moptimizer_was_run\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mscale_before\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mscale_after\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1857\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1858\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1859\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1860\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moptimizer_was_run\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdeepspeed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/optim/lr_scheduler.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0minstance\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_step_count\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0mwrapped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__get__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minstance\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapped\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;31m# Note that the returned function here is no longer a bound method,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/optim/optimizer.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[0mprofile_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Optimizer.step#{}.step\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprofiler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecord_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprofile_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 140\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 141\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_optimizer_step_code\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.8/dist-packages/transformers/optimization.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[0;31m# Decay the first and second moment running average coefficient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# In-place operations to update the averages at the same time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 360\u001b[0;31m \u001b[0mexp_avg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 361\u001b[0m \u001b[0mexp_avg_sq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maddcmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0mdenom\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexp_avg_sq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqrt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"eps\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# 3. Future Steps & Considerations"
],
"metadata": {
"id": "PVJ2Td9Lf19q"
}
},
{
"cell_type": "markdown",
"source": [
"\n",
"1. Logistic Regression performs quite well for a start. We can definitely improve its performance by cleaning out the outliers. To find outliers, we need to consider that:\n",
" * It can be a wrong data entry (e.g. a human typing error)\n",
" * It can be a record whose values are not relevant (e.g. a total entry that is calculated as the sum of the above columns. Such data can be misleading at times, so it should be removed)\n",
" * It can be a data entry in which all or most fields are blank (e.g. a row in the data where all fields are blank. Such a row may not contribute anything to the analysis)\n",
" * It may be an extreme value that falls far outside the range of the other data (e.g. the length is too long to be a company or customer name)\n",
"\n",
"2. Random Forest is both slower and performs slightly worse. To be sure, we might need to test it again after removing the outliers.\n",
"\n",
"3. Even though training of the transformer model is not completed yet, I do believe its performance will be better, because it is based on BERT, whose performance might be considered SoTA. However, it is a heavy model and not cheap to deploy, so we might just pick Logistic Regression for its speed.\n",
"\n",
"4. Employing LightGBM and XGBoost might also be effective since they are boosting algorithms and might increase performance. \n",
"\n",
"5. If there were time, I would also try TF-IDF, because **CountVectorizer** simply counts the number of times a word appears in a document (using a bag-of-words approach), while the **TF-IDF** vectorizer takes into account not only how many times a word appears in a document but also how important that word is to the whole corpus.\n",
"\n",
"6. After trying Logistic Regression, Random Forest, and LightGBM, we can combine them into an ensemble model, specifically by following this tutorial: https://www.geeksforgeeks.org/ensemble-methods-in-python/\n",
"\n",
"7. To deploy these models, I would follow these steps using **FastAPI**. For Logistic Regression and Random Forest: https://towardsdatascience.com/colabcode-deploying-machine-learning-models-from-google-colab-54e0d37a7b09\n",
"For HuggingFace Models: \n",
"https://colab.research.google.com/drive/1jrKblK4iISeilrCasc02G8JAd8Z3G45h?usp=sharing\n",
"\n",
"8. I tried not to exceed 4 hours. Training a transformer takes a lot of time. Therefore, I am submitting this task without completing that training.\n",
"\n",
"\n"
],
"metadata": {
"id": "i3199G-Xf-43"
}
}
],
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"9c4fb2d129444963aea420c854eef3fc": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_3b5cacca64fd493585821976258161ea",
"IPY_MODEL_23189cd56a1241a0a380908881212b80",
"IPY_MODEL_5c195943784a4148879a2eba852de73e"
],
"layout": "IPY_MODEL_96c47c1a7a56495b8769b2abc91c7554"
}
},
"3b5cacca64fd493585821976258161ea": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c267747bc9eb4f479725973e6f5a45e3",
"placeholder": "​",
"style": "IPY_MODEL_506588deea78460ea457fcfe500e6573",
"value": "100%"
}
},
"23189cd56a1241a0a380908881212b80": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_840a8c9797104868a9de57f861911ebd",
"max": 202,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_1975d008d30a4ca3886ee5df896b7761",
"value": 202
}
},
"5c195943784a4148879a2eba852de73e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_80925e9024c0420094607c2c6acb6bdc",
"placeholder": "​",
"style": "IPY_MODEL_b080984034b946ed82c2a980142bf692",
"value": " 202/202 [00:08&lt;00:00, 20.86ba/s]"
}
},
"96c47c1a7a56495b8769b2abc91c7554": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c267747bc9eb4f479725973e6f5a45e3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"506588deea78460ea457fcfe500e6573": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"840a8c9797104868a9de57f861911ebd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1975d008d30a4ca3886ee5df896b7761": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"80925e9024c0420094607c2c6acb6bdc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b080984034b946ed82c2a980142bf692": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"1f6f87f4e0ef4832860c44a70eb1e4b0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_c5a6012a4db54497aef8b5b504299e18",
"IPY_MODEL_e2433a191b9d439e97194e46c8ef911b",
"IPY_MODEL_5d6825da704e4a669ab5e65025aab34a"
],
"layout": "IPY_MODEL_f1e91c961e804fc4952c0e01821bcfc6"
}
},
"c5a6012a4db54497aef8b5b504299e18": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_5a7feaa348e948c9a49128440581575e",
"placeholder": "​",
"style": "IPY_MODEL_516791f2b3c44db289647cf13406ab3a",
"value": "100%"
}
},
"e2433a191b9d439e97194e46c8ef911b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_fd29542280ee4e93b98cc6435bac95cc",
"max": 51,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_a6318f96b86e4f78b7b83dbdb06650e3",
"value": 51
}
},
"5d6825da704e4a669ab5e65025aab34a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3b94b9df7c9d4e2e84bafab06d4c996e",
"placeholder": "​",
"style": "IPY_MODEL_9b41efcc8b6e4fb598528a9471c2ce09",
"value": " 51/51 [00:01&lt;00:00, 29.13ba/s]"
}
},
"f1e91c961e804fc4952c0e01821bcfc6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5a7feaa348e948c9a49128440581575e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"516791f2b3c44db289647cf13406ab3a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"fd29542280ee4e93b98cc6435bac95cc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a6318f96b86e4f78b7b83dbdb06650e3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"3b94b9df7c9d4e2e84bafab06d4c996e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"9b41efcc8b6e4fb598528a9471c2ce09": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"2927debb8f1a4f8896e8e3e64672aa3b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ee01f994e4884ee289af2076c2418f9c",
"IPY_MODEL_25c4ae34365547afbf0757d7a1983eb9",
"IPY_MODEL_3272acbdf75f4dc5a54058638cd5b3ca"
],
"layout": "IPY_MODEL_6884a271eac1481ca360bc792d61ae48"
}
},
"ee01f994e4884ee289af2076c2418f9c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_000629234cfe477b971b14ca9cec62bb",
"placeholder": "​",
"style": "IPY_MODEL_88cc1d67e50d4539b50334f8661e1910",
"value": "Downloading builder script: 100%"
}
},
"25c4ae34365547afbf0757d7a1983eb9": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_962f9d867fd844c8b4810bf394821f96",
"max": 6771,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_aeb7ffe019804fb28c64cd406dc1f118",
"value": 6771
}
},
"3272acbdf75f4dc5a54058638cd5b3ca": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c3dbae8b12c64860a2fb9332dd105144",
"placeholder": "​",
"style": "IPY_MODEL_661a6a170eb64681816b2ba3d0d664b2",
"value": " 6.77k/6.77k [00:00&lt;00:00, 235kB/s]"
}
},
"6884a271eac1481ca360bc792d61ae48": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"000629234cfe477b971b14ca9cec62bb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"88cc1d67e50d4539b50334f8661e1910": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"962f9d867fd844c8b4810bf394821f96": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aeb7ffe019804fb28c64cd406dc1f118": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"c3dbae8b12c64860a2fb9332dd105144": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"661a6a170eb64681816b2ba3d0d664b2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"95b0db5c543f43eebf63635730b7037d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_41118f6465ce48c39dec15b4d1103952",
"IPY_MODEL_43e310d1238a4b409748ec997e8b0862",
"IPY_MODEL_99cbf6b355344cbdb355aef2885f9031"
],
"layout": "IPY_MODEL_402feee4662a433bbcf178a79302a770"
}
},
"41118f6465ce48c39dec15b4d1103952": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_4f7125ae83804de0a29f4bc79ed64684",
"placeholder": "​",
"style": "IPY_MODEL_b96c6005b2d145639ee11f7d884fa60f",
"value": "Downloading (…)lve/main/config.json: 100%"
}
},
"43e310d1238a4b409748ec997e8b0862": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_cb5d668913694d928fa866c141574f45",
"max": 483,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_5e4c10ef1d7c489e920cee620fbd3f98",
"value": 483
}
},
"99cbf6b355344cbdb355aef2885f9031": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6e65358a194b4688810ff2811401500d",
"placeholder": "​",
"style": "IPY_MODEL_27dae2fa50ef45678e14b873371e80ae",
"value": " 483/483 [00:00&lt;00:00, 14.0kB/s]"
}
},
"402feee4662a433bbcf178a79302a770": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4f7125ae83804de0a29f4bc79ed64684": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b96c6005b2d145639ee11f7d884fa60f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"cb5d668913694d928fa866c141574f45": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"5e4c10ef1d7c489e920cee620fbd3f98": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"6e65358a194b4688810ff2811401500d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"27dae2fa50ef45678e14b873371e80ae": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"d068bd0702fe4088b8f39a5c950d6e37": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_b526822b801249c4ac5f4c39a8cbb926",
"IPY_MODEL_a89e11a023c7463794f82cf2ac63df37",
"IPY_MODEL_eb1edbcadfd649a0ba0532c85079173a"
],
"layout": "IPY_MODEL_1a3c5460e6de4d8c897e51dea64851da"
}
},
"b526822b801249c4ac5f4c39a8cbb926": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_544e750a299545f08b9c5938228d9c12",
"placeholder": "​",
"style": "IPY_MODEL_e1f7c2e195324494bee2e45e1930e4f7",
"value": "Downloading (…)&quot;pytorch_model.bin&quot;;: 100%"
}
},
"a89e11a023c7463794f82cf2ac63df37": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_97a00339991e4f27bae09592918eccd4",
"max": 267967963,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_0cd8237e1fb74ea4ad2c7c5973c9081b",
"value": 267967963
}
},
"eb1edbcadfd649a0ba0532c85079173a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b24abd4b909f44eda2347a43b2e8c719",
"placeholder": "​",
"style": "IPY_MODEL_6f12fee2b06d41c3855fd50df4c53841",
"value": " 268M/268M [00:02&lt;00:00, 106MB/s]"
}
},
"1a3c5460e6de4d8c897e51dea64851da": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"544e750a299545f08b9c5938228d9c12": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e1f7c2e195324494bee2e45e1930e4f7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"97a00339991e4f27bae09592918eccd4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0cd8237e1fb74ea4ad2c7c5973c9081b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b24abd4b909f44eda2347a43b2e8c719": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6f12fee2b06d41c3855fd50df4c53841": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment