Created
April 28, 2023 12:48
-
-
Save lobstrio/02d293387ef7fc2d096bd109cba344c4 to your computer and use it in GitHub Desktop.
Scrape price and title from *any* product page, with Python and ChatGPT 🤖
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import html2text | |
import re | |
import argparse | |
# OpenAI API key. Read from the OPENAI_API_KEY environment variable when it is
# set (and non-empty) so a real key never has to be written into this file;
# otherwise the original placeholder value is used.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or 'YOUR_OPEN_AI_API_KEY'

# Endpoint that receives the chat-completion request.
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

# Prompt template: the single %s placeholder is filled with the page text.
PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
article_title
article_url
article_price
%s"""

# Maximum number of words of page text that are embedded in the prompt.
MAX_GPT_WORDS = 2000
class pricingPagesGPTScraper:
    """Download a product page and ask the OpenAI chat API to extract its data.

    The pipeline implemented by ``main`` is: fetch the HTML, convert it to
    plain text, trim it to a word budget, embed it in ``PROMPT`` and post it
    to ``COMPLETION_URL``.

    Validation failures are raised as ``AssertionError`` (explicitly, not via
    the ``assert`` statement) so they are still enforced under ``python -O``
    while keeping the exception type existing callers may already handle.
    """

    def __init__(self):
        # Session reused for every page download made by this instance.
        self.s = requests.Session()

    def get_html(self, url, timeout=30):
        """Return the HTML body of ``url``.

        Args:
            url: Non-empty string with the page address.
            timeout: Seconds passed to ``requests`` as the request timeout.

        Raises:
            AssertionError: If ``url`` is not a non-empty string or the
                server does not answer with HTTP 200.
        """
        if not (url and isinstance(url, str)):
            raise AssertionError('url must be a non-empty string')
        print('[get_html]\n%s' % url)
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
            'cache-control': 'max-age=0',
            'sec-ch-device-memory': '8',
            'sec-ch-dpr': '2',
            'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-ch-ua-platform-version': '"12.5.0"',
            'sec-ch-viewport-width': '1469',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            'viewport-width': '1469',
        }
        # The session's headers are replaced wholesale by the browser-like set.
        self.s.headers = headers
        # Without a timeout a stalled server would block this call forever.
        r = self.s.get(url, timeout=timeout)
        if r.status_code != 200:
            raise AssertionError(
                'unexpected HTTP status %s for %s' % (r.status_code, url))
        return r.text

    def convert_html_to_text(self, html):
        """Convert ``html`` to text with html2text, ignoring links and images.

        Raises:
            AssertionError: If ``html`` is empty or the conversion yields
                no text.
        """
        if not html:
            raise AssertionError('html must not be empty')
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        text = h.handle(html)
        if not text:
            raise AssertionError('html2text produced no text')
        return text

    def reduce_text_size(self, text, max_words=None):
        """Truncate ``text`` so it contains at most ``max_words`` words.

        Words are runs matching ``\\w+``. When the text is over budget it is
        cut right after its ``max_words``-th word, so the result never
        exceeds the budget and never ends in the middle of a word.

        Args:
            text: Non-empty text to shorten.
            max_words: Word budget; ``None`` means ``MAX_GPT_WORDS``.

        Returns:
            ``text`` unchanged when within budget, otherwise its prefix.

        Raises:
            AssertionError: If ``text`` is empty.
            ValueError: If the word budget is smaller than 1.
        """
        # Validate before len(): a None text must not surface as TypeError.
        if not text:
            raise AssertionError('text must not be empty')
        if max_words is None:
            max_words = MAX_GPT_WORDS
        if max_words < 1:
            raise ValueError('max_words must be at least 1')
        print('Starting text size: %s' % len(text))
        word_matches = list(re.finditer(r'\w+', text))
        if len(word_matches) > max_words:
            size_ratio = len(word_matches) / max_words
            print('/!\\ text too large! size being divided by %s' % size_ratio)
            # Cut at the end of the last word that still fits the budget.
            text = text[:word_matches[max_words - 1].end()]
        print('Ending text size: %s' % len(text))
        return text

    def fill_prompt(self, text):
        """Return ``PROMPT`` with ``text`` substituted for its placeholder.

        Raises:
            AssertionError: If ``text`` is empty.
        """
        if not text:
            raise AssertionError('text must not be empty')
        # Tuple form keeps the substitution correct whatever type text has.
        return PROMPT % (text,)

    # NOTE: failures are reported as AssertionError so that a retry decorator
    # such as @retry(AssertionError, tries=3, delay=2) can wrap this method.
    def get_gpt(self, prompt, timeout=120):
        """Send ``prompt`` to the chat-completions endpoint and return the reply.

        Args:
            prompt: Text sent as the single user message.
            timeout: Seconds passed to ``requests`` as the request timeout.

        Returns:
            The ``content`` of the first choice's message in the response.

        Raises:
            AssertionError: If the API does not answer with HTTP 200.
        """
        headers = {
            'Authorization': 'Bearer %s' % OPENAI_API_KEY,
        }
        json_data = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            'temperature': 0.7
        }
        # Posted outside the session, so the browser headers set by get_html
        # are not sent to the API.
        response = requests.post(
            COMPLETION_URL, headers=headers, json=json_data, timeout=timeout)
        if response.status_code != 200:
            raise AssertionError(
                'completion request failed with HTTP %s: %s'
                % (response.status_code, response.text[:200]))
        return response.json()["choices"][0]["message"]["content"]

    def main(self, url):
        """Run the whole pipeline for ``url`` and return the model's answer.

        Raises:
            AssertionError: If ``url`` is empty or any step fails validation.
        """
        if not url:
            raise AssertionError('url must not be empty')
        html = self.get_html(url)
        text = self.convert_html_to_text(html)
        text = self.reduce_text_size(text)
        prompt = self.fill_prompt(text)
        return self.get_gpt(prompt)
def main():
    """Command-line entry point: scrape one product page and print the answer.

    Reads the page address from ``--url`` / ``-u`` (a default product URL is
    used when the option is omitted), runs ``pricingPagesGPTScraper.main`` on
    it, prints the returned answer and then a success banner.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--url', '-u', type=str, required=False, help='product page url to be scraped', default='https://www.amazon.com/dp/B09723XSVM')
    args = argparser.parse_args()
    url = args.url
    # Reject an empty --url through argparse so the check also runs under
    # `python -O` and the user gets a usage message instead of a traceback.
    if not url:
        argparser.error('--url must not be empty')
    pp = pricingPagesGPTScraper()
    answer = pp.main(url)
    print(answer)
    # Raw string: the banner is full of backslashes that are not valid escape
    # sequences; as a plain literal they trigger a SyntaxWarning on recent
    # Python versions. The printed text is unchanged.
    print(r'''~~ success
_ _ _
| | | | | |
| | ___ | |__ ___| |_ __ __
| |/ _ \| '_ \/ __| __/| '__|
| | (_) | |_) \__ \ |_ | |
|_|\___/|_.__/|___/\__||_|
''')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
bravo très beau boulot