[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/yamini/715c90fd625ab6aaf58ee443bfbefc42/classify_freetext_with_nlp_yelp.ipynb)

# Using Gretel Classify to Label Free Text

In this blueprint, we analyze and label a set of Yelp reviews, looking for PII and other potentially sensitive information.

## Setup

First we install our Python dependencies and configure the Gretel client.

_Note: we install spaCy for its visualization helper, displacy._

```python
!pip install -Uqq gretel-client spacy datasets
```

```python
from getpass import getpass
import json

import datasets
import pandas as pd
import yaml
from smart_open import open
from gretel_client import create_project, poll, configure_session, ClientConfig

pd.set_option('max_colwidth', None)

dataset_file_path = "reviews.csv"

configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter Gretel API key"),
        endpoint="https://api.gretel.cloud",
    )
)
```

## Load the dataset

Using Hugging Face's [datasets](https://github.com/huggingface/datasets) library, we load a dataset containing a dump of [Yelp reviews](https://huggingface.co/datasets/yelp_review_full). This data contains unstructured review text that we pass through a NER pipeline for labeling and PII discovery.

```python
source_dataset = datasets.load_dataset("yelp_review_full")
source_df = pd.DataFrame(source_dataset["train"]).sample(n=300, random_state=99)
source_df.to_csv(dataset_file_path, index=False)
```

```python
source_df.head()
```
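Before sending the sample to the classifier, it can help to sanity-check what we pulled. A small sketch, assuming the standard `yelp_review_full` columns (`label` for the star rating, `text` for the review body):

```python
# Optional sanity check on the 300-review sample:
# star-rating balance and review length distribution.
print(source_df["label"].value_counts().sort_index())
print(source_df["text"].str.len().describe())
```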
] }, { "cell_type": "code", "metadata": { "id": "dw1QMDr40Zzg" }, "source": [ "source_dataset = datasets.load_dataset(\"yelp_review_full\")\n", "source_df = pd.DataFrame(source_dataset[\"train\"]).sample(n=300, random_state=99)\n", "source_df.to_csv(dataset_file_path, index=False)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2ywqtTxf0Zzh" }, "source": [ "source_df.head()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "HoClMb1E0Zzh" }, "source": [ "## Configure a Gretel Project and Model" ] }, { "cell_type": "code", "metadata": { "id": "7VI-uVY70Zzi" }, "source": [ "project = create_project(display_name=\"Gretel NLP Yelp Reviews\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "S0l3tqQX0Zzi" }, "source": [ "# Passing `use_nlp: true` into the model config,\n", "# enables additional predictions using NLP models.\n", "classify_config = \"\"\"\n", "schema_version: \"1.0\"\n", "models:\n", " - classify:\n", " data_source: \"_\"\n", " use_nlp: true\n", "\"\"\"" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "2pvl7qbr0Zzj" }, "source": [ "If you wish to transform the dataset instead, you may pass the same `use_nlp: true` property into a transformation pipeline. For an example of a transform pipeline, see the [Redact PII Notebook](https://github.com/gretelai/gretel-blueprints/blob/main/docs/notebooks/redact_pii.ipynb). Below is an example that uses nlp.\n", "\n", "```yaml\n", "schema_version: \"1.0\"\n", "models:\n", " - transforms:\n", " data_source: \"_\"\n", " use_nlp: true\n", " policies:\n", " - name: remove_pii\n", " rules:\n", " - name: redact_pii\n", " conditions: \n", " value_label:\n", " - person_name\n", " - location\n", " - credit_card_number\n", " - phone_number\n", " - email_address\n", " transforms:\n", " - type: fake\n", " - type: redact_with_char\n", " attrs:\n", " char: X\n", "```" ] }, { "cell_type": "markdown", "metadata": { "id": "R1LAysyo0Zzj" }, "source": [ "### Create the Classification Model\n", "\n", "This next cell will create the classification model. After we verify the model is working correctly, the the entire dataset will be passed into the model for classification." ] }, { "cell_type": "code", "metadata": { "id": "a0jjxsWu0Zzk" }, "source": [ "model = project.create_model_obj(yaml.safe_load(classify_config), dataset_file_path)\n", "model.submit_cloud()\n", "poll(model)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "By5EcgYP0Zzk" }, "source": [ "Using the created model, we download the report to get a summary view of found entities. This report is based on a sample of the original dataset, and is used to ensure the model has been configured correctly." 
] }, { "cell_type": "code", "metadata": { "id": "D76WDvpM0Zzk" }, "source": [ "# `report_json` contains a summary of entities by field\n", "with open(model.get_artifact_link(\"report_json\")) as fh:\n", " report = json.load(fh)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "25FPWSd40Zzl" }, "source": [ "# By converting these summaries into a dataframe we can quickly view\n", "# entities found by the model.\n", "summary = []\n", "for field in report[\"metadata\"][\"fields\"]:\n", " row = {\"name\": field[\"name\"]}\n", " for entity in field[\"entities\"]:\n", " row[entity[\"label\"]] = entity[\"count\"]\n", " summary.append(row)\n", "\n", "pd.DataFrame(summary).set_index(\"name\").fillna(0)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "otkUQlrf0Zzl" }, "source": [ "### Classify the reviews\n", "\n", "Now that the model has been configured and verified, let's run the full dataset through the model." ] }, { "cell_type": "code", "metadata": { "id": "vQz1bRNc0Zzl" }, "source": [ "records = model.create_record_handler_obj(data_source=dataset_file_path)\n", "records.submit_cloud()\n", "poll(records)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8dJdAA2C0Zzm" }, "source": [ "# the `data` artifact returns a JSONL formatted file containing\n", "# entity predictions by row.\n", "with open(records.get_artifact_link(\"data\")) as fh:\n", " records = [json.loads(line) for line in fh]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "background_save": true }, "id": "AN2mQqyW0Zzm" }, "source": [ "from IPython.display import clear_output\n", "from spacy import displacy\n", "\n", "\n", "for row, entities in zip(source_df.values, records):\n", " label, text = row\n", "\n", " colors = {}\n", " palette = ['#7aecec', '#bfeeb7', '#feca74', '#ff9561', '#aa9cfc', '#c887fb', \n", " '#9cc9cc', '#ffeb80', '#ff8197', '#ff8197', '#f0d0ff', '#bfe1d9', \n", " '#e4e7d2']\n", "\n", " for index, label in enumerate([x[\"label\"] for x in entities['entities']]):\n", " colors[label.upper()] = palette[index % len(palette)]\n", "\n", " options = {\"ents\": list(colors.keys()), \"colors\": colors}\n", "\n", " displacy.render(\n", " {\n", " \"text\": text,\n", " \"ents\": [e for e in entities[\"entities\"] if e[\"field\"] == \"text\"]\n", " },\n", " style=\"ent\",\n", " jupyter=True,\n", " manual=True,\n", " options=options\n", " )\n", " input(\"\\nPress [enter] to see the next review\")\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "eIraRtkI0Zzm" }, "source": [ "# now that you've run the notebook, you can also view the same\n", "# project using our web console.\n", "project.get_console_url()" ], "execution_count": null, "outputs": [] } ] }