mattppal/mintlify-GPT-4.ipynb

## mintlify-GPT-4.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3a3f88e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "import yaml\n",
    "import glob\n",
    "import markdown\n",
    "from rich.pretty import pprint\n",
    "import json\n",
    "from collections import defaultdict\n",
    "from itertools import chain\n",
    "import frontmatter\n",
    "from dotenv import load_dotenv\n",
    "import tiktoken"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c9d08a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv(\"/Users/mattpalmer/.secrets/openai.env\")\n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "openai.organization = os.getenv(\"OPENAI_ORG_ID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff5f87b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_message = \"\"\"\n",
    "You will be provided documentation on a data transformation tool, \"Mage\". \\\n",
    "The content supplied is a page in our documentation and a path to that page \\ \n",
    "relative to the current location in a file system. The content is formatted \\\n",
    "in EITHER \"Markdown\" or \"MDX\" format. Please use standard Markdown/MDX syntax in \\\n",
    "interpreting this content and disregard any frontmatter (content between ---).\n",
    "\n",
    "You should use the path as a hint as to the category of the page, but \\\n",
    "be aware that many paths will be incorrect. The content of the document \\\n",
    "should be used as the primary motivation for the category of the document.\n",
    "\n",
    "Classify the into a primary category and a secondary category. \\\n",
    "Additionally, documents may have a tertiary category, but this is optional. \\\n",
    "Overview pages should never have a tertiary category. If you feel that a \\\n",
    "secondary or tertiary category should exist with certainty, create one. \\\n",
    "Categories are defined in a JSON structure like the following: \\\n",
    "{\"Primary-1\": {\"Secondary-1\": [\"Tertiary-1\", \"Tertiary-2\"], \"Secondary-2\": [\"Tertiary-1\", etc.}... etc}.\n",
    "\n",
    "Please note that \"data integrations\" are distinctly different from \"integrations.\" \\\n",
    "\"data integrations\" refer to a service similar to fivetran or meltano— they move data \\\n",
    "between a source and target. \"Integrations\" refer to Mage-specific integrations, i.e. \\\n",
    "extensions or compatible tools.\n",
    "\n",
    "Provide your output in json format with the keys: current_filepath, primary, secondary, \\\n",
    "and tertiary. For docs lacking a tertiary category, please return an empty string ''\n",
    "\"\"\"\n",
    "\n",
    "categories = {\n",
    "    \"Docs\": {\n",
    "        \"Introduction\": [\"Setup\", \"Development\"],\n",
    "        \"Configuration\": [\n",
    "            \"Storage\",\n",
    "            \"Kernels\",\n",
    "            \"Variables\",\n",
    "            \"Dependencies\",\n",
    "            \"Versioning\",\n",
    "        ],\n",
    "        \"Concepts\": [\"Design\", \"Abstractions\", \"Orchestration\"],\n",
    "        \"dbt\": [\n",
    "            \"Configuration\",\n",
    "            \"Models\",\n",
    "            \"Commands\",\n",
    "        ],\n",
    "        \"Integrations\": [\n",
    "            \"Computation\",\n",
    "            \"Orchestration\",\n",
    "            \"Transformation\",\n",
    "            \"Observability\",\n",
    "            \"Reverse ETL\",\n",
    "        ],\n",
    "        \"About\": [\"Community\"],\n",
    "    },\n",
    "    \"Guides\": {\"Get Started\": [], \"Pipeline Development\": [], \"Blocks\": []},\n",
    "    \"Deploy\": {\n",
    "        \"Get Started\": [],\n",
    "        \"Cloud\": [],\n",
    "        \"CI/CD\": [],\n",
    "        \"Team Management\": [],\n",
    "        \"Version Control\": [],\n",
    "    },\n",
    "    \"Contribute\": {\"Get Started\": [], \"Backend\": [], \"Frontend\": []},\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15958e00",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_completion_from_doc(doc, model=\"gpt-4-0613\", temperature=0.25, max_tokens=8000):\n",
    "    return openai.ChatCompletion.create(\n",
    "        model=model,\n",
    "        messages=doc,\n",
    "        temperature=temperature,\n",
    "        max_tokens=max_tokens,\n",
    "    )\n",
    "\n",
    "def num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\"):\n",
    "    \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
    "    try:\n",
    "        encoding = tiktoken.encoding_for_model(model)\n",
    "    except KeyError:\n",
    "        print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
    "        encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
    "    if model in {\n",
    "        \"gpt-3.5-turbo-0613\",\n",
    "        \"gpt-3.5-turbo-16k-0613\",\n",
    "        \"gpt-4-0314\",\n",
    "        \"gpt-4-32k-0314\",\n",
    "        \"gpt-4-0613\",\n",
    "        \"gpt-4-32k-0613\",\n",
    "        }:\n",
    "        tokens_per_message = 3\n",
    "        tokens_per_name = 1\n",
    "    elif model == \"gpt-3.5-turbo-0301\":\n",
    "        tokens_per_message = 4  # every message follows <|start|>{role/name}\\n{content}<|end|>\\n\n",
    "        tokens_per_name = -1  # if there's a name, the role is omitted\n",
    "    elif \"gpt-3.5-turbo\" in model:\n",
    "        print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\")\n",
    "        return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\")\n",
    "    elif \"gpt-4\" in model:\n",
    "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
    "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
    "    else:\n",
    "        raise NotImplementedError(\n",
    "            f\"\"\"num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.\"\"\"\n",
    "        )\n",
    "    num_tokens = 0\n",
    "    for message in messages:\n",
    "        num_tokens += tokens_per_message\n",
    "        print(tokens_per_message)\n",
    "        for key, value in message.items():\n",
    "            num_tokens += len(encoding.encode(value))\n",
    "            if key == \"name\":\n",
    "                num_tokens += tokens_per_name\n",
    "    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>\n",
    "    return num_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5289cc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import floor\n",
    "\n",
    "def exclude(input_dir):\n",
    "    # we're confident about these OR they break things\n",
    "    if \"/api-reference\" in input_dir:\n",
    "        return False\n",
    "    elif \"/about/\" in input_dir:\n",
    "        return False\n",
    "    elif \"/data-integration\" in input_dir:\n",
    "        return False\n",
    "    else:\n",
    "        return True\n",
    "\n",
    "cat = []\n",
    "for d in list(filter(exclude, glob.glob(\"../git-repos/mage-ai/docs/*/*.md*\")))[0:2]:\n",
    "    with open(d, \"r\") as f:\n",
    "        # remove frontmatter\n",
    "        metadata, content = frontmatter.parse(f.read())\n",
    "\n",
    "        messages = [\n",
    "            {\"role\": \"system\", \"content\": system_message},\n",
    "            {\"role\": \"system\", \"content\": json.dumps(categories)},\n",
    "            {\"role\": \"user\", \"content\": d},\n",
    "            {\"role\": \"user\", \"content\": content[0:3000]},\n",
    "        ]\n",
    "        print(d)\n",
    "        print(num_tokens_from_messages(messages, 'gpt-4-0613'))\n",
    "\n",
    "        response = get_completion_from_doc(messages, temperature=0, max_tokens=8192)\n",
    "        cat.append(json.loads(response.choices[0].message[\"content\"]))\n",
    "        print(json.loads(response.choices[0].message[\"content\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efccd6dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_mint_json(dictionary):\n",
    "    output = []\n",
    "    for key, value in dictionary.items():\n",
    "        # Get items without a tertiary category\n",
    "        if isinstance(value, dict):\n",
    "            t = sum([v for k, v in value.items() if len(k) == 0], [])\n",
    "        else:\n",
    "            t = []\n",
    "\n",
    "        if key != \"group\" and len(key) > 0:\n",
    "            new = {\"group\": key, \"pages\": t}\n",
    "            # recurse to build mint.json\n",
    "            if isinstance(value, dict):\n",
    "                new[\"pages\"] += build_mint_json(value)\n",
    "\n",
    "            # append to pages\n",
    "            elif isinstance(value, list):\n",
    "                new[\"pages\"] += value\n",
    "            output.append(new)\n",
    "\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c6ffd2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "tree = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))\n",
    "\n",
    "for f in cat:\n",
    "    p = f[\"primary\"]\n",
    "    s = f[\"secondary\"]\n",
    "    t = f[\"tertiary\"]\n",
    "    fp = f[\"current_filepath\"]\n",
    "\n",
    "    tree[p][s][t].append(fp)\n",
    "\n",
    "mint = build_mint_json(tree)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py3-default",
   "language": "python",
   "name": "pyenv_py3-default"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "a3a3f88e",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import openai\n",
	"import yaml\n",
	"import glob\n",
	"import markdown\n",
	"from rich.pretty import pprint\n",
	"import json\n",
	"from collections import defaultdict\n",
	"from itertools import chain\n",
	"import frontmatter\n",
	"from dotenv import load_dotenv\n",
	"import tiktoken"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3c9d08a1",
	"metadata": {},
	"outputs": [],
	"source": [
	"load_dotenv(\"/Users/mattpalmer/.secrets/openai.env\")\n",
	"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
	"openai.organization = os.getenv(\"OPENAI_ORG_ID\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ff5f87b2",
	"metadata": {},
	"outputs": [],
	"source": [
	"system_message = \"\"\"\n",
	"You will be provided documentation on a data transformation tool, \"Mage\". \\\n",
	"The content supplied is a page in our documentation and a path to that page \\ \n",
	"relative to the current location in a file system. The content is formatted \\\n",
	"in EITHER \"Markdown\" or \"MDX\" format. Please use standard Markdown/MDX syntax in \\\n",
	"interpreting this content and disregard any frontmatter (content between ---).\n",
	"\n",
	"You should use the path as a hint as to the category of the page, but \\\n",
	"be aware that many paths will be incorrect. The content of the document \\\n",
	"should be used as the primary motivation for the category of the document.\n",
	"\n",
	"Classify the into a primary category and a secondary category. \\\n",
	"Additionally, documents may have a tertiary category, but this is optional. \\\n",
	"Overview pages should never have a tertiary category. If you feel that a \\\n",
	"secondary or tertiary category should exist with certainty, create one. \\\n",
	"Categories are defined in a JSON structure like the following: \\\n",
	"{\"Primary-1\": {\"Secondary-1\": [\"Tertiary-1\", \"Tertiary-2\"], \"Secondary-2\": [\"Tertiary-1\", etc.}... etc}.\n",
	"\n",
	"Please note that \"data integrations\" are distinctly different from \"integrations.\" \\\n",
	"\"data integrations\" refer to a service similar to fivetran or meltano— they move data \\\n",
	"between a source and target. \"Integrations\" refer to Mage-specific integrations, i.e. \\\n",
	"extensions or compatible tools.\n",
	"\n",
	"Provide your output in json format with the keys: current_filepath, primary, secondary, \\\n",
	"and tertiary. For docs lacking a tertiary category, please return an empty string ''\n",
	"\"\"\"\n",
	"\n",
	"categories = {\n",
	" \"Docs\": {\n",
	" \"Introduction\": [\"Setup\", \"Development\"],\n",
	" \"Configuration\": [\n",
	" \"Storage\",\n",
	" \"Kernels\",\n",
	" \"Variables\",\n",
	" \"Dependencies\",\n",
	" \"Versioning\",\n",
	" ],\n",
	" \"Concepts\": [\"Design\", \"Abstractions\", \"Orchestration\"],\n",
	" \"dbt\": [\n",
	" \"Configuration\",\n",
	" \"Models\",\n",
	" \"Commands\",\n",
	" ],\n",
	" \"Integrations\": [\n",
	" \"Computation\",\n",
	" \"Orchestration\",\n",
	" \"Transformation\",\n",
	" \"Observability\",\n",
	" \"Reverse ETL\",\n",
	" ],\n",
	" \"About\": [\"Community\"],\n",
	" },\n",
	" \"Guides\": {\"Get Started\": [], \"Pipeline Development\": [], \"Blocks\": []},\n",
	" \"Deploy\": {\n",
	" \"Get Started\": [],\n",
	" \"Cloud\": [],\n",
	" \"CI/CD\": [],\n",
	" \"Team Management\": [],\n",
	" \"Version Control\": [],\n",
	" },\n",
	" \"Contribute\": {\"Get Started\": [], \"Backend\": [], \"Frontend\": []},\n",
	"}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "15958e00",
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_completion_from_doc(doc, model=\"gpt-4-0613\", temperature=0.25, max_tokens=8000):\n",
	" return openai.ChatCompletion.create(\n",
	" model=model,\n",
	" messages=doc,\n",
	" temperature=temperature,\n",
	" max_tokens=max_tokens,\n",
	" )\n",
	"\n",
	"def num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\"):\n",
	" \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n",
	" try:\n",
	" encoding = tiktoken.encoding_for_model(model)\n",
	" except KeyError:\n",
	" print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
	" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
	" if model in {\n",
	" \"gpt-3.5-turbo-0613\",\n",
	" \"gpt-3.5-turbo-16k-0613\",\n",
	" \"gpt-4-0314\",\n",
	" \"gpt-4-32k-0314\",\n",
	" \"gpt-4-0613\",\n",
	" \"gpt-4-32k-0613\",\n",
	" }:\n",
	" tokens_per_message = 3\n",
	" tokens_per_name = 1\n",
	" elif model == \"gpt-3.5-turbo-0301\":\n",
	" tokens_per_message = 4 # every message follows <\|start\|>{role/name}\\n{content}<\|end\|>\\n\n",
	" tokens_per_name = -1 # if there's a name, the role is omitted\n",
	" elif \"gpt-3.5-turbo\" in model:\n",
	" print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.\")\n",
	" return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0613\")\n",
	" elif \"gpt-4\" in model:\n",
	" print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
	" return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
	" else:\n",
	" raise NotImplementedError(\n",
	" f\"\"\"num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.\"\"\"\n",
	" )\n",
	" num_tokens = 0\n",
	" for message in messages:\n",
	" num_tokens += tokens_per_message\n",
	" print(tokens_per_message)\n",
	" for key, value in message.items():\n",
	" num_tokens += len(encoding.encode(value))\n",
	" if key == \"name\":\n",
	" num_tokens += tokens_per_name\n",
	" num_tokens += 3 # every reply is primed with <\|start\|>assistant<\|message\|>\n",
	" return num_tokens"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "b5289cc4",
	"metadata": {},
	"outputs": [],
	"source": [
	"from math import floor\n",
	"\n",
	"def exclude(input_dir):\n",
	" # we're confident about these OR they break things\n",
	" if \"/api-reference\" in input_dir:\n",
	" return False\n",
	" elif \"/about/\" in input_dir:\n",
	" return False\n",
	" elif \"/data-integration\" in input_dir:\n",
	" return False\n",
	" else:\n",
	" return True\n",
	"\n",
	"cat = []\n",
	"for d in list(filter(exclude, glob.glob(\"../git-repos/mage-ai/docs//.md*\")))[0:2]:\n",
	" with open(d, \"r\") as f:\n",
	" # remove frontmatter\n",
	" metadata, content = frontmatter.parse(f.read())\n",
	"\n",
	" messages = [\n",
	" {\"role\": \"system\", \"content\": system_message},\n",
	" {\"role\": \"system\", \"content\": json.dumps(categories)},\n",
	" {\"role\": \"user\", \"content\": d},\n",
	" {\"role\": \"user\", \"content\": content[0:3000]},\n",
	" ]\n",
	" print(d)\n",
	" print(num_tokens_from_messages(messages, 'gpt-4-0613'))\n",
	"\n",
	" response = get_completion_from_doc(messages, temperature=0, max_tokens=8192)\n",
	" cat.append(json.loads(response.choices[0].message[\"content\"]))\n",
	" print(json.loads(response.choices[0].message[\"content\"]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "efccd6dd",
	"metadata": {},
	"outputs": [],
	"source": [
	"def build_mint_json(dictionary):\n",
	" output = []\n",
	" for key, value in dictionary.items():\n",
	" # Get items without a tertiary category\n",
	" if isinstance(value, dict):\n",
	" t = sum([v for k, v in value.items() if len(k) == 0], [])\n",
	" else:\n",
	" t = []\n",
	"\n",
	" if key != \"group\" and len(key) > 0:\n",
	" new = {\"group\": key, \"pages\": t}\n",
	" # recurse to build mint.json\n",
	" if isinstance(value, dict):\n",
	" new[\"pages\"] += build_mint_json(value)\n",
	"\n",
	" # append to pages\n",
	" elif isinstance(value, list):\n",
	" new[\"pages\"] += value\n",
	" output.append(new)\n",
	"\n",
	" return output"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "6c6ffd2b",
	"metadata": {},
	"outputs": [],
	"source": [
	"tree = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))\n",
	"\n",
	"for f in cat:\n",
	" p = f[\"primary\"]\n",
	" s = f[\"secondary\"]\n",
	" t = f[\"tertiary\"]\n",
	" fp = f[\"current_filepath\"]\n",
	"\n",
	" tree[p][s][t].append(fp)\n",
	"\n",
	"mint = build_mint_json(tree)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "py3-default",
	"language": "python",
	"name": "pyenv_py3-default"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}