import http.client
import json
import os
import urllib.parse

import requests
from bs4 import BeautifulSoup
import openai
import nltk

# Download the NLTK Punkt tokenizer model (used for truncation below)
nltk.download('punkt')

# Read the API keys from the environment
BING_API_KEY = os.environ['BING_SEARCH_V7_SUBSCRIPTION_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Configure the OpenAI library
openai.api_key = OPENAI_API_KEY
def extract_full_content(url):
    """Fetch a page and return its visible text, with scripts and styles removed."""
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.content, 'html.parser')
    for script in soup(["script", "style"]):
        script.decompose()
    full_text = ' '.join(soup.stripped_strings)
    return full_text
def truncate_text(text, max_tokens=2500):
    # NLTK word tokens only approximate the model's BPE tokens (a word is often
    # more than one BPE token), so the default leaves headroom for the prompt
    # wrapper and the 1024-token completion in the 4097-token context window.
    tokens = nltk.word_tokenize(text)
    truncated_tokens = tokens[:max_tokens]
    return ' '.join(truncated_tokens)
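
# A minimal alternative sketch, assuming the optional `tiktoken` package is
# installed: it counts the model's actual BPE tokens instead of approximating
# with NLTK word tokens, so the limit can track the real context window.
try:
    import tiktoken

    def truncate_text_bpe(text, max_tokens=2500):
        enc = tiktoken.encoding_for_model("text-davinci-003")
        token_ids = enc.encode(text)
        return enc.decode(token_ids[:max_tokens])
except ImportError:
    pass  # fall back to the NLTK word-token approximation above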
def extract_pertinent_information(prompt):
    """Ask the completion model to pull the relevant details out of scraped text."""
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5,
    )
    return response.choices[0].text.strip()
def generate_entries(prompt):
    """Ask the completion model for entries, one per line, and return them as a list."""
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=2048,
        n=1,
        stop=None,
        temperature=0.5,
    )
    return response.choices[0].text.strip().split("\n")
# Query the Bing Web Search API for the top five results
term = 'dress shirt'
subscriptionKey = BING_API_KEY
host = 'api.bing.microsoft.com'
path = '/v7.0/search'
params = '?q=' + urllib.parse.quote(term) + '&count=5'
headers = {'Ocp-Apim-Subscription-Key': subscriptionKey}

conn = http.client.HTTPSConnection(host)
conn.request("GET", path + params, headers=headers)
response = conn.getresponse()
results = json.loads(response.read())
conn.close()
# Extract pertinent information from each of the top search results
pertinent_information_list = []
for i in range(min(len(results['webPages']['value']), 5)):
    url = results['webPages']['value'][i]['url']
    full_text_content = extract_full_content(url)
    # Truncate the scraped text so the prompt plus the 1024-token completion
    # fits within text-davinci-003's 4097-token context window
    truncated_text_content = truncate_text(full_text_content)
    extraction_prompt = f"Extract only pertinent information related to crafting a perfect typescript interface for {term} from the following content: {truncated_text_content}"
    pertinent_info_gpt3 = extract_pertinent_information(extraction_prompt)
    pertinent_information_list.append({
        "source_url": url,
        "pertinent_information": pertinent_info_gpt3
    })
# Generate ten entries matching the perfect JSON schema using GPT-3
pertinent_info_combined_str = ''.join(
    f"\n{i+1}. {info['pertinent_information']}"
    for i, info in enumerate(pertinent_information_list)
)
generate_entries_prompt = (
    f"Based on the following pertinent information related to crafting a perfect "
    f"typescript interface for {term}:{pertinent_info_combined_str}\n"
    f"Generate 10 entries matching the perfect JSON schema:"
)
generated_entries_gpt3 = generate_entries(generate_entries_prompt)
# Display the results in JSON format
output_json = {
    "input": term,
    "pertinent_information_list": pertinent_information_list,
    "generated_entries": generated_entries_gpt3
}
print(json.dumps(output_json, indent=2))
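
# A hedged post-processing sketch: the model is asked for JSON entries, but the
# output format is not guaranteed. `parse_generated_entries` is a hypothetical
# helper (not part of the original script) that keeps only the lines that parse
# as valid JSON objects.
def parse_generated_entries(entries):
    parsed = []
    for entry in entries:
        try:
            obj = json.loads(entry)
        except json.JSONDecodeError:
            continue  # skip lines the model did not emit as valid JSON
        if isinstance(obj, dict):
            parsed.append(obj)
    return parsed

# Example usage:
# valid_entries = parse_generated_entries(generated_entries_gpt3)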