Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save lobstrio/02d293387ef7fc2d096bd109cba344c4 to your computer and use it in GitHub Desktop.
Save lobstrio/02d293387ef7fc2d096bd109cba344c4 to your computer and use it in GitHub Desktop.
Scrape price and title from *any* product page, with Python and ChatGPT 🤖
import os
import requests
import html2text
import re
import argparse
# OpenAI API key. Read from the OPENAI_API_KEY environment variable so the
# secret does not have to be committed to the source; when the variable is
# unset, the original placeholder value is used unchanged.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_OPEN_AI_API_KEY')

# Chat-completions endpoint the prompt is POSTed to.
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

# Prompt template; the single %s placeholder receives the page text.
PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
article_title
article_url
article_price
%s"""

# Word budget (words counted with the regex \w+) above which the page text
# is truncated before being inserted into the prompt.
MAX_GPT_WORDS = 2000
class pricingPagesGPTScraper:
    """Fetch a product page and ask the OpenAI chat API to describe it.

    The pipeline, run by :meth:`main`, is: download the HTML, convert it to
    plain text, truncate the text to a word budget, insert it into the
    prompt template and POST the prompt to the chat-completions endpoint.

    Every validation or HTTP-status failure is raised as ``AssertionError``
    (explicitly, so the checks are not stripped by ``python -O``).
    """

    # Seconds to wait on each HTTP call before giving up; without a timeout
    # a stalled server would block the scraper forever.
    REQUEST_TIMEOUT = 60

    # Browser-like request headers sent when downloading the product page.
    BROWSER_HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
        'cache-control': 'max-age=0',
        'sec-ch-device-memory': '8',
        'sec-ch-dpr': '2',
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-ch-ua-platform-version': '"12.5.0"',
        'sec-ch-viewport-width': '1469',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        'viewport-width': '1469',
    }

    def __init__(self):
        """Create the HTTP session used to download product pages."""
        self.s = requests.Session()

    @staticmethod
    def _require(condition, message):
        """Raise ``AssertionError(message)`` unless *condition* is truthy."""
        if not condition:
            raise AssertionError(message)

    def get_html(self, url):
        """Download *url* and return the response body as text.

        Raises:
            AssertionError: if *url* is not a non-empty string, or the
                server answers with a status other than 200.
        """
        self._require(
            url and isinstance(url, str),
            'url must be a non-empty string, got %r' % (url,),
        )
        print('[get_html]\n%s' % url)
        # Assign (rather than merge) so the session's previous headers are
        # replaced by this header set on every call.
        self.s.headers = dict(self.BROWSER_HEADERS)
        r = self.s.get(url, timeout=self.REQUEST_TIMEOUT)
        self._require(
            r.status_code == 200,
            'GET %s returned HTTP %s' % (url, r.status_code),
        )
        return r.text

    def convert_html_to_text(self, html):
        """Convert *html* to text with html2text, ignoring links and images.

        Raises:
            AssertionError: if *html* or the converted text is empty.
        """
        self._require(html, 'html must not be empty')
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        text = h.handle(html)
        self._require(text, 'html2text produced no text')
        return text

    def reduce_text_size(self, text, max_words=None):
        """Truncate *text* so that it roughly fits the word budget.

        Words are counted with the regex ``\\w+``. When the count exceeds
        the budget, the text is cut to a character prefix whose length is
        the original length divided by the words-to-budget ratio; the cut
        is by characters, so it may fall in the middle of a word.

        Args:
            text: non-empty text to shrink.
            max_words: word budget; defaults to ``MAX_GPT_WORDS``.

        Returns:
            *text* unchanged when within budget, otherwise its prefix.

        Raises:
            AssertionError: if *text* is empty or *max_words* is not
                positive.
        """
        # Validate before len(text) so a None input fails with the
        # documented AssertionError rather than a TypeError.
        self._require(text, 'text must not be empty')
        if max_words is None:
            max_words = MAX_GPT_WORDS
        self._require(max_words > 0, 'max_words must be positive')
        print('Starting text size: %s' % len(text))
        words = re.findall(r'\w+', text)
        if len(words) > max_words:
            size_ratio = len(words) / max_words
            print('/!\\ text too large! size being divided by %s' % size_ratio)
            max_characters = int(len(text) // size_ratio)
            text = text[:max_characters]
        print('Ending text size: %s' % len(text))
        return text

    def fill_prompt(self, text):
        """Return the ``PROMPT`` template with *text* inserted.

        Raises:
            AssertionError: if *text* is empty.
        """
        self._require(text, 'text must not be empty')
        return PROMPT % text

    # NOTE: failures are raised as AssertionError, so a retry decorator such
    # as `@retry(AssertionError, tries=3, delay=2)` can wrap this method.
    def get_gpt(self, prompt):
        """POST *prompt* to ``COMPLETION_URL`` and return the reply text.

        The prompt is sent as a single user message to ``gpt-3.5-turbo``;
        the returned value is the ``content`` string of the first choice.

        Raises:
            AssertionError: if the API answers with a status other than 200.
        """
        headers = {
            'Authorization': 'Bearer %s' % OPENAI_API_KEY,
        }
        json_data = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            'temperature': 0.7
        }
        response = requests.post(
            COMPLETION_URL,
            headers=headers,
            json=json_data,
            timeout=self.REQUEST_TIMEOUT,
        )
        self._require(
            response.status_code == 200,
            'POST %s returned HTTP %s' % (COMPLETION_URL, response.status_code),
        )
        return response.json()["choices"][0]["message"]["content"]

    def main(self, url):
        """Run the whole pipeline on *url* and return the model's answer.

        Raises:
            AssertionError: if *url* is empty or any pipeline step fails
                its validation or HTTP-status check.
        """
        self._require(url, 'url must not be empty')
        html = self.get_html(url)
        text = self.convert_html_to_text(html)
        text = self.reduce_text_size(text)
        prompt = self.fill_prompt(text)
        return self.get_gpt(prompt)
def main(argv=None):
    """Command-line entry point: scrape one product page and print the answer.

    Args:
        argv: optional list of command-line arguments; when ``None``,
            argparse reads them from ``sys.argv``.

    The ``--url``/``-u`` option selects the page to scrape and defaults to
    a sample Amazon product URL. An empty URL is rejected through
    ``argparser.error`` (usage message and exit) instead of an ``assert``,
    which would be stripped under ``python -O``.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        '--url', '-u', type=str, required=False,
        help='product page url to be scraped',
        default='https://www.amazon.com/dp/B09723XSVM',
    )
    args = argparser.parse_args(argv)
    url = args.url
    if not url:
        argparser.error('--url must not be empty')
    pp = pricingPagesGPTScraper()
    answer = pp.main(url)
    print(answer)
    # Raw string: the banner contains backslashes (\|, \/, \_) that are not
    # valid escape sequences and would trigger a warning in a normal string
    # literal. The printed output is unchanged.
    print(r'''~~ success
_ _ _
| | | | | |
| | ___ | |__ ___| |_ __ __
| |/ _ \| '_ \/ __| __/| '__|
| | (_) | |_) \__ \ |_ | |
|_|\___/|_.__/|___/\__||_|
''')
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
@seb2668
Copy link

seb2668 commented Jun 6, 2023

bravo très beau boulot

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment