Web scraping + GPT-3. Given a company name, we scrape Google for relevant URLs and then scrape those URLs for info, persisting each step in case Google blocks us, so we can switch IP and carry on. Once we've accumulated our corpus, we feed it into OpenAI to generate company categories and descriptions.
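For orientation, here is a minimal sketch of the record shape that flows between the two scripts below: the scraper (second script) writes a list of these records to res.yaml, and the OpenAI script (first script) reads them back. The field names come from the code; the values are made up for illustration.

# Illustrative only: field names are the ones the scraper fills in, the values here are invented.
example_record = {
    'name': 'Adjust.com',                    # comes from source.yaml
    'www_url': 'https://www.adjust.com/',    # found via get_url(name, " company website")
    'about_url': 'https://www.adjust.com/company/',  # found via get_url(www_url, " about us page ...")
    'docs_url': 'https://dev.adjust.com/',   # found via get_url(www_url, " API docs")
    'www_content': '... headings and paragraphs scraped from the homepage ...',
    'about_content': '... headings and paragraphs scraped from the about page ...',
    'google_description': '... the description Google shows for the site ...',
}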
# https://beta.openai.com/docs/libraries
import os
import openai
import yaml
# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
# open res.yaml (produced by the scraper script below) and load it into the sources variable
with open('res.yaml') as f:
    sources = yaml.load(f, Loader=yaml.FullLoader)
cats = """
The following are valid categories:
- Advertising
- Analytics
- AWS
- CRM
- Collaboration
- Communication
- Content
- Customer Success
- Data Lake
- Data Warehouse
- Databases
- Dev Tools
- E-Commerce
- ERP
- Email
- Event Streaming
- Files
- Finance
- Google Cloud
- Human Resources
- Marketing Automation
- Marketplace
- Microsoft Azure
- Payments
- Recruiting
- Search
- Security
- Survey
"""
def getOpenAIResponses(data):
    prompt = f"""The following is a web scrape of all content relevant to {data['name']}:
---
Their Google description says: {data['google_description']}
---
Their about page says: {data['about_content']}
---
Their website says: {data['www_url']}
---
"""
instructions = f"""
Please reply with a comma separated list of up to 3 valid categories for this company specifically only chosen from the list above.
Do not use any other categories not specifilly listed above.
Examples
Name: Adjust.com
Categories: Advertising, Analytics, Marketing Automation
Name: AlloyDB for PostgreSQL
Categories: Databases, Dev Tools, Security
Name: {data['name']}
Categories: """
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + cats + instructions, temperature=0.5, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
# print(response)
data['ai_categories'] = response['choices'][0]['text'].strip().replace('\n', ' ')
instructions = f"""Please reply with a non-numbered, comma-separated list of five different search terms for this company. Do not use the company name.
Examples
Name: Airtable
Terms: custom applications, no-code apps, online spreadsheets, workflow management, business transformation
Name: Apify Dataset
Terms: data scraping, web scraping, automation, data extraction, web crawler
Name: {data['name']}
Terms:"""
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
# print(instructions)
# print(response)
data['ai_searchterms'] = response['choices'][0]['text'].strip().replace('\n', ' ')
instructions = f"""Please reply with an exciting, dynamic and engaging marketing Headline describing {data['name']} in less than 6 words.
Do not use its name in this headline.
Examples
Name: AppFollow
Headline: Insights to Help Your Mobile App Thrive
Name: BigCommerce
Headline: The Most Trusted Commerce Solution Provider
Name: {data['name']}
Headline: """
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
# print(instructions)
# print(response)
data['ai_headline'] = response['choices'][0]['text'].strip().replace('\n', ' ')
# if the headline starts with the name, remove the name
if data['ai_headline'].startswith(data['name']):
data['ai_headline'] = data['ai_headline'].replace(data['name'] + ": ", '').strip()
instructions = f"""
Please reply with a detailed and technical medium-length Description for {data['name']} in less than 40 words.
Describe the names of its main products and what it does, writing for a technical data engineer audience.
Begin.
{data['name']} is """
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=256, top_p=1, frequency_penalty=1, presence_penalty=1)
# print(instructions)
# print(response)
data['ai_description'] = response['choices'][0]['text'].strip().replace('\n', ' ')
instructions = f"Please reply with a detailed and technically in-depth description listing each and every products, features, use cases, and origin story of {data['name']} specifically for the data engineering audience in under 200 words.\n\nBegin.\n\n"
response = openai.Completion.create(model="text-davinci-002", prompt=prompt + instructions, temperature=0, max_tokens=1000, top_p=1, frequency_penalty=1, presence_penalty=1)
# print(instructions)
# print(response)
data['ai_long_description'] = response['choices'][0]['text'].strip().replace('\n', ' ')
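# getOpenAIResponses mutates its argument in place, adding five single-line string fields:
# ai_categories, ai_searchterms, ai_headline, ai_description, ai_long_description.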
import csv
for i, data in enumerate(sources[0:30]):
    print(f"Processing {i} of {len(sources)}: " + data['name'])
    getOpenAIResponses(data)
    # # delete the content field in data
    # del data['www_content']
    # del data['about_content']
    # # dump all sources to an xlsx file
    # with open('res.xlsx', 'w') as f:
    #     w = csv.DictWriter(f, sources[0].keys())
    #     w.writeheader()
    #     w.writerows(sources)
# save all columns in sources to csv
with open('res.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(sources[0].keys())
    for data in sources[0:30]:
        writer.writerow(data.values())
# time and random are used to wait a random amount between requests
import time
import random
from pprint import pprint
# load source.yaml and parse it
import yaml
with open('source.yaml') as f:
    data = yaml.load(f, Loader=yaml.FullLoader)
# print length of data
print('loaded data of size: ' + str(len(data)))
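# source.yaml is assumed to be a plain list of company entries, each needing at least a name, e.g.:
# - name: Adjust.com
# - name: Airtable
# - name: AlloyDB for PostgreSQL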
# loop through the data and google each name to find the url
import requests
from bs4 import BeautifulSoup
from openaiscript import getOpenAIResponses
# a function to get the url from google given the name of a company
def get_url(name, extra=""):
    url = 'https://www.google.com/search?q=' + name + extra
    # replace spaces with + so the query is a valid url
    url = url.replace(" ", "+")
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # # print separator
    # print('---')
    # print('---')
    # print('---')
    # # print the page source
    # print(soup.prettify())
    # print('---')
    # print('---')
    # print('---')
    # get every link from the page
    links = soup.find_all('a')
    # pprint(links)
    # get the first result from the google results page
    result = soup.find('div', class_='yuRUbf')
    # pprint(url)
    # pprint(result)
    # get the url from the result and remove any #fragment
    link = result.find('a')['href'].split('#')[0]
    # if the link is just the name we searched for (e.g. the homepage when we want the about page), take the next result
    while link == name:
        result = result.find_next('div', class_='yuRUbf')
        link = result.find('a')['href'].split('#')[0]
    return link
def siteContent(url):
    # retrieve all the words from the website except from the footer
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    words = ""
    # loop through each section of the website and for each get the headings and then the text
    for section in soup.find_all('section'):
        # get text of all headings in this section
        heading = section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        # convert list to string
        heading = ' '.join([str(elem.get_text()) for elem in heading])
        # get the text of the section
        text = section.find_all('p')
        # convert list to string
        text = ' '.join([str(elem.get_text()) for elem in text])
        # add the heading and text to the data
        words += heading + " " + text
    # strip newlines
    words = words.replace("\n", " ")
    return words
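# Note: siteContent only looks inside <section> tags, so sites that do not use them will yield an empty string.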
def extractGoogleDescription(url):
    # google "what is <url>?" and scrape the description, e.g.
    # https://www.google.com/search?q=what+is+www.adjust.com%3F
    url = 'https://www.google.com/search?q=what+is+' + url + "%3F"
    # get url with google chrome user agent
    page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'})
    soup = BeautifulSoup(page.content, 'html.parser')
    # get the div whose data-attrid is "wa:/description", which contains the description
    result = soup.find('div', attrs={'data-attrid': 'wa:/description'})
    # if there is no description, fall back to the snippet of the first result
    if result is None:
        result = soup.find('div', class_='yuRUbf')
        # get the description of that first result (if there was one)
        result = result.find('div', class_='BNeawe s3v9rd AP7Wnd') if result is not None else None
    return result.get_text() if result is not None else "No description available"
####################################
# get the data
####################################
start = 22  # note that we last got to index 21, start at 22 next
end = start + 20
# loop with index
for i, x in enumerate(data[start:end]):
    print(str(i) + ': getting data for ' + x['name'])
    time.sleep(random.randint(1, 5))
    x["www_url"] = get_url(x["name"], " company website")
    time.sleep(random.randint(1, 5))
    x["about_url"] = get_url(x["www_url"], " about us page for the company")
    time.sleep(random.randint(1, 5))
    x["docs_url"] = get_url(x["www_url"], " API docs")
    x["www_content"] = siteContent(x["www_url"])
    x["about_content"] = siteContent(x["about_url"])
    time.sleep(random.randint(1, 5))
    x["google_description"] = extractGoogleDescription(x["www_url"])
    # print separator
    # getOpenAIResponses(x)
    print('------------------------------------')
    # save everything processed so far to a file, so a Google block only costs us the current item
    with open('res.yaml', 'w') as f:
        yaml.dump(data[:start + i + 1], f)
# # pretty print the array
# pprint(data[0])
# # save the data to a file
# with open('res.yaml', 'w') as f:
#     yaml.dump(data, f)
# # save data to csv
# import csv
# with open('res.csv', 'w', newline='') as csvfile:
#     # get list of fieldnames from the first item
#     fieldnames = list(data[0].keys())
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for x in data[:30]:
#         writer.writerow(x)