{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Install Pinecone Canopy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -qU \\\n",
    "    pinecone-resin \\\n",
    "    datasets"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Set Env Vars"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, we set up our environment variables:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"PINECONE_API_KEY\"] = os.environ.get(\"PINECONE_API_KEY\") or \"...\"\n",
    "os.environ[\"PINECONE_ENVIRONMENT\"] = os.environ.get(\"PINECONE_ENVIRONMENT\") or \"us-west1-gcp\"\n",
    "os.environ[\"INDEX_NAME\"] = os.environ.get(\"INDEX_NAME\") or \"canopy-101\"\n",
    "os.environ[\"OPENAI_API_KEY\"] = os.environ.get(\"OPENAI_API_KEY\") or \"sk-...\""
   ]
  },
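  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before moving on, a quick sanity check can catch missing keys early. This is an optional sketch (assuming the variable names used above, and treating the `\"...\"` and `\"sk-...\"` placeholders as unset):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# optional: fail fast if any required variable is missing or still a placeholder\n",
    "required = [\"PINECONE_API_KEY\", \"PINECONE_ENVIRONMENT\", \"INDEX_NAME\", \"OPENAI_API_KEY\"]\n",
    "missing = [\n",
    "    name for name in required\n",
    "    if os.environ.get(name) in (None, \"\", \"...\", \"sk-...\")\n",
    "]\n",
    "if missing:\n",
    "    raise RuntimeError(f\"Set real values for: {missing}\")"
   ]
  },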
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Create New Index"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can create a new Canopy index like so:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!resin new"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Create Parquet Files"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Canopy reads local parquet files that contain the fields `[\"id\", \"text\", \"metadata\"]`. We will use the [`jamescalam/ai-arxiv`](https://huggingface.co/datasets/jamescalam/ai-arxiv) dataset. First we download it like so:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'content', 'references'],\n",
       "    num_rows: 423\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "data = load_dataset(\"jamescalam/ai-arxiv\", split=\"train\")\n",
    "data"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then we reformat it into the structure Canopy expects:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "86f5dd3bb1164d038d6bfe286265c5b3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/423 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'text', 'metadata'],\n",
       "    num_rows: 423\n",
       "})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = data.map(lambda x: {\n",
    "    \"text\": x[\"content\"],\n",
    "    \"metadata\": {\n",
    "        \"title\": x[\"title\"],\n",
    "        \"url\": x[\"source\"],\n",
    "        \"primary_category\": x[\"primary_category\"],\n",
    "        \"published\": x[\"published\"],\n",
    "        \"updated\": x[\"updated\"],\n",
    "    }\n",
    "})\n",
    "# drop unneeded columns\n",
    "data = data.remove_columns([\n",
    "    \"title\", \"summary\", \"source\",\n",
    "    \"authors\", \"categories\", \"comment\",\n",
    "    \"journal_ref\", \"primary_category\",\n",
    "    \"published\", \"updated\", \"content\",\n",
    "    \"references\"\n",
    "])\n",
    "data"
   ]
  },
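  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The mapping above can be sanity-checked on a plain dict, independent of `datasets`. A small illustrative sketch (the record values here are made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# toy record with the same fields the map above consumes (values are fake)\n",
    "x = {\n",
    "    \"content\": \"example paper text\",\n",
    "    \"title\": \"Example Title\",\n",
    "    \"source\": \"https://example.org/paper\",\n",
    "    \"primary_category\": \"cs.CL\",\n",
    "    \"published\": \"2023-01-01\",\n",
    "    \"updated\": \"2023-01-02\",\n",
    "}\n",
    "# the same transform the lambda applies to each record\n",
    "row = {\n",
    "    \"text\": x[\"content\"],\n",
    "    \"metadata\": {\n",
    "        \"title\": x[\"title\"],\n",
    "        \"url\": x[\"source\"],\n",
    "        \"primary_category\": x[\"primary_category\"],\n",
    "        \"published\": x[\"published\"],\n",
    "        \"updated\": x[\"updated\"],\n",
    "    },\n",
    "}\n",
    "sorted([\"id\", *row])  # the three fields Canopy expects"
   ]
  },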
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then save to parquet:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow.parquet as pq\n",
    "\n",
    "pq.write_table(data.data.table, \"data.parquet\")"
   ]
  },
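  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, we can read the file back to confirm its schema before upserting. A quick check (assuming `data.parquet` was written to the current directory by the cell above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow.parquet as pq\n",
    "\n",
    "# read the parquet file back and confirm the columns Canopy expects\n",
    "table = pq.read_table(\"data.parquet\")\n",
    "print(table.column_names)  # should include 'id', 'text', 'metadata'\n",
    "print(table.num_rows)"
   ]
  },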
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Upserting Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!resin upsert ./data.parquet"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ml",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}