@cnych
Created March 28, 2023 02:07
Building a ChatGPT Q&A service on top of a private knowledge base
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

domain = "docs.youdianzhishi.com"  # <- put your domain to be crawled
full_url = "https://docs.youdianzhishi.com/"  # <- put your domain to be crawled with https or http

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:
            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []
            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)
    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/" + local_domain + "/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open('text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
            # Get the text from the URL using BeautifulSoup
            r = requests.get(url)
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the page requires JavaScript to render, note it (the placeholder text is still saved)
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)
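
For reference, a small sketch of how crawl() turns a page URL into the text file it writes (the example URL is hypothetical):

# Illustration only: the output filename drops the "https://" prefix (url[8:])
# and turns "/" into "_". The example URL below is hypothetical.
url = "https://docs.youdianzhishi.com/docs/example"
local_domain = "docs.youdianzhishi.com"
path = 'text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt"
print(path)  # text/docs.youdianzhishi.com/docs.youdianzhishi.com_docs_example.txt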
The rest of the pipeline (cleaning, tokenizing, embedding, and answering questions) lives in the accompanying Jupyter notebook, cell by cell:

# ---- Notebook cell: helper to collapse newlines and doubled spaces in the scraped text ----
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie
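
A quick, illustrative check of what remove_newlines does to a pandas string Series:

# Illustration only: newlines and doubled spaces collapse to single spaces
import pandas as pd
print(remove_newlines(pd.Series(["first line\nsecond  line"])).iloc[0])
# -> "first line second line"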

# ---- Notebook cell: flatten the crawled pages into processed/scraped.csv ----
import os
import pandas as pd

domain = "docs.youdianzhishi.com"
full_url = "https://docs.youdianzhishi.com/"

# Create a list to store the text files
texts = []

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()

    # Strip the first 11 characters and the ".txt" extension from the filename,
    # then replace - and _ with spaces and drop "#update" to form a title
    texts.append((file[11:-4].replace('-', ' ').replace('_', ' ').replace('#update', ''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Set the text column to be the title plus the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()
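
A note on the file[11:-4] slice: it drops a fixed 11-character prefix, but the filenames produced by the crawler start with the full domain followed by an underscore, which is longer than that. If the intent is to strip the domain from the title, a sketch that derives the prefix length from the domain might look like this (an assumption about intent, not part of the original notebook):

# Sketch (assumption): compute the prefix length from the domain instead of hard-coding 11
prefix = len(domain) + 1  # the domain plus the "_" that replaced the "/" after it
title = file[prefix:-4].replace('-', ' ').replace('_', ' ').replace('#update', '')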

# ---- Notebook cell: install tokenizer and plotting dependencies ----
%pip install tiktoken matplotlib

# ---- Notebook cell: count tokens per document ----
import tiktoken
import pandas as pd

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()
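
As a quick sanity check, the same tokenizer can be used interactively to see how many tokens a short string costs (illustrative only; the exact count depends on the string):

# Illustration only: count tokens for a short mixed Chinese/English string
sample = "Prometheus Operator 是什么?"
print(len(tokenizer.encode(sample)))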

# ---- Notebook cell: split long documents into chunks of at most max_tokens tokens ----
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the last, partially filled chunk so the trailing sentences are not lost
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks


shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['text'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(row[1]['text'])
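
One caveat: split_into_many breaks text on '. ', which suits English prose, but the crawled docs here are largely Chinese, where sentences end with '。'. A minimal sketch of a splitter that handles both separators (an addition, not part of the original notebook):

# Sketch (assumption): also split on the Chinese full stop so chunk boundaries
# fall closer to real sentence boundaries in Chinese documentation.
def split_sentences(text):
    parts = []
    for piece in text.split('。'):
        parts.extend(piece.split('. '))
    return [p.strip() for p in parts if p.strip()]

# Inside split_into_many, `sentences = split_sentences(text)` would replace
# `sentences = text.split('. ')`.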

# ---- Notebook cell: rebuild the dataframe from the shortened chunks ----
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()

# ---- Notebook cell: embed each chunk with text-embedding-ada-002 ----
import openai

df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])

df.to_csv('processed/embeddings.csv')
df.head()
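
Embedding every row with a bare apply aborts on the first API error. A small sketch of a retry wrapper around the same call, assuming transient failures such as rate limits are worth retrying:

# Sketch (assumption): retry each embedding request a few times before giving up
import time

def embed_with_retry(text, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return openai.Embedding.create(
                input=text, engine='text-embedding-ada-002'
            )['data'][0]['embedding']
        except Exception as e:
            print(f"embedding failed (attempt {attempt + 1}): {e}")
            time.sleep(delay)
    return None

# df['embeddings'] = df.text.apply(embed_with_retry)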

# ---- Notebook cell: install numpy ----
%pip install numpy

# ---- Notebook cell: load the embeddings back from disk ----
import numpy as np
import pandas as pd

df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

df.head()
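
Since the embeddings column comes back from CSV as a string, eval is used to parse it; ast.literal_eval is a safer drop-in for that step (a suggestion, not part of the original notebook):

# Sketch: parse the stringified embedding lists without executing arbitrary code
from ast import literal_eval

# df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)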

# ---- Notebook cell: install plotting and scientific dependencies ----
%pip install plotly scipy scikit-learn -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com

# ---- Notebook cell: build a context string from the most relevant chunks ----
from openai.embeddings_utils import distances_from_embeddings

def create_context(question, df, max_len=1800):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)
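
openai.embeddings_utils pulls in extra dependencies (which is likely why plotly, scipy and scikit-learn are installed above). If that import is unavailable, the same cosine distances can be computed directly with numpy; a minimal sketch:

# Sketch (assumption): cosine distance computed directly with numpy, equivalent
# to distances_from_embeddings(..., distance_metric='cosine')
import numpy as np

def cosine_distances(query_embedding, embeddings):
    q = np.array(query_embedding)
    m = np.stack(list(embeddings))  # shape: (n_rows, embedding_dim)
    sims = m @ q / (np.linalg.norm(m, axis=1) * np.linalg.norm(q))
    return 1 - sims

# df['distances'] = cosine_distances(q_embeddings, df['embeddings'].values)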

# ---- Notebook cell: reload the embeddings dataframe ----
import pandas as pd
import numpy as np

df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df.head()

# ---- Notebook cell: answer a question with gpt-3.5-turbo using the retrieved context ----
import openai

def answer_question(
    df,
    question="Kubernetes是什么?",
    max_len=1800,
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer the question using the most similar contexts from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
    )
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            messages=[
                {"role": "system", "content": "You are youdianzhishi's official knowledge base AI robot assistant"},
                {"role": "user", "content": f"Answer the question based on the context below in Chinese, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"},
            ]
        )
        result = ''
        for choice in response.choices:
            result += choice.message.content
        return result
    except Exception as e:
        print(e)
        return ""

answer_question(df, question="如何使用Prometheus Operator?", debug=True)
@reginaldzhou

Hi, I found the source code from the video, but which path is the documentation under? I'm still running into a few small issues deploying it myself.
