@aleksandr-dzhumurat
Created August 2, 2023 08:09
Make scraping easy
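This snippet fetches a listings page, slices it into per-event HTML blocks with BeautifulSoup, and asks gpt-3.5-turbo to extract each event as structured JSON, collecting the results into a pandas DataFrame.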
import json
import logging
import os

import backoff
import openai  # for OpenAI API calls (pre-1.0 SDK style)
import pandas as pd
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

openai.api_key = os.getenv("OPENAI_API_KEY")  # expects the key in the environment
def validate_gpt_response(gpt_raw_resp: str) -> dict:
    """Parse the raw GPT output as JSON, falling back to empty fields on failure."""
    gpt_raw_resp = gpt_raw_resp.replace('\n', '')
    resp = {
        'event_name': None, 'event_link': None, 'location': None,
        'description': None, 'time_start': None, 'time_end': None,
        'raw_gpt_response': gpt_raw_resp,
    }
    try:
        resp_gpt = json.loads(gpt_raw_resp)
        resp.update(resp_gpt)
    except json.JSONDecodeError:
        pass  # keep the defaults; the raw response is preserved for debugging
    return resp
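# A quick sanity check for validate_gpt_response (the sample payload below is
# made up for illustration, not real model output):
#
# >>> validate_gpt_response('{"event_name": "Jazz Night", "location": "Portland"}')
# {'event_name': 'Jazz Night', 'event_link': None, 'location': 'Portland',
#  'description': None, 'time_start': None, 'time_end': None,
#  'raw_gpt_response': '{"event_name": "Jazz Night", "location": "Portland"}'}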
def prompt_generation(raw_html: str) -> str:
    prompt = f"""
I have an HTML code snippet. The snippet contains an activity description.
Extract the event description from the HTML in JSON format.
The resulting JSON should contain these fields: event name, link to event, location, description, time start, time end.
HTML: {raw_html}
JSON:
"""
    return prompt
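# Illustrative use of prompt_generation with a toy HTML fragment (the markup
# here is invented, not taken from either target site):
#
# >>> print(prompt_generation('<div><h3>Jazz Night</h3><p>Fri 8pm, Blue Note</p></div>'))
# I have an HTML code snippet. The snippet contains an activity description.
# ...
# HTML: <div><h3>Jazz Night</h3><p>Fri 8pm, Blue Note</p></div>
# JSON: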
@backoff.on_exception(backoff.expo, openai.error.ServiceUnavailableError)
@backoff.on_exception(backoff.expo, openai.error.APIError)
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.Timeout)
def gpt_query(gpt_prompt, verbose: bool = False):
    if verbose:
        print(gpt_prompt)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        max_tokens=1000,
        temperature=0.0,
        top_p=0.5,
        frequency_penalty=0.5,
        messages=[
            {
                "role": "system",
                "content": "You are an HTML parser.",
            },
            {
                "role": "user",
                "content": gpt_prompt,
            },
        ],
    )
    gpt_response_raw = response.get("choices")[0].get("message").get("content").replace('\n', '')
    res = {'gpt_resp': validate_gpt_response(gpt_response_raw)}
    res.update(response.get("usage").to_dict())  # merge token-usage stats into the result
    return res
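# Example call (commented out because it makes a billable API request; assumes
# OPENAI_API_KEY is set in the environment):
#
# out = gpt_query(prompt_generation('<div><h3>Jazz Night</h3></div>'), verbose=True)
# out['gpt_resp']['event_name']   # parsed field, or None if the JSON was invalid
# out['total_tokens']             # usage stats merged in from the API response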
def get_iterable_from_url(url):
    result_df = pd.DataFrame([])
    resp = requests.get(url)
    dummy_scraper = BeautifulSoup(markup=resp.content, features="html.parser")
    # Each supported site needs its own selector for the per-event blocks
    if 'eater.com/' in url:
        page_blocks = dummy_scraper.find_all(name='section', class_='c-mapstack__card')
    elif 'everout.com/' in url:
        page_blocks = dummy_scraper.find_all(name='div', class_='event-schedules')
    else:
        logger.error('Valid scraper not found')
        return result_df
    page_description = []
    for i in page_blocks:
        print('sending to OpenAI %d...' % len(i.text))
        current_prompt = prompt_generation(i.text)
        gpt_resp = gpt_query(current_prompt)
        gpt_resp['gpt_resp'].update({'source_url': url, 'raw_html': str(i)})
        page_description.append(gpt_resp['gpt_resp'])
    result_df = pd.json_normalize(page_description)
    return result_df
# """romantic dinner in new york"""
#result_df = get_iterable_from_url('https://ny.eater.com/maps/best-romantic-restaurants-date-night-nyc')
# gg = get_iterable_from_url('https://ny.eater.com/maps/best-romantic-restaurants-date-night-nyc')
result_df = get_iterable_from_url('https://everout.com/portland/events/?category=live-music-world-latin')
result_df.head(5)
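# Optionally persist the scraped events; 'events.csv' is just an example path:
# result_df.to_csv('events.csv', index=False)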