Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ahmedshahriar/52d09af6596da034945a295ef78e87aa to your computer and use it in GitHub Desktop.
Save ahmedshahriar/52d09af6596da034945a295ef78e87aa to your computer and use it in GitHub Desktop.
import requests
import re
# Browser-like request headers passed to the page fetches in this script.
headers = {
    'authority': 'www.amazon.com',
    # Ask for an uncached copy of the page.
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/51.0.2704.64 Safari/537.36'
    ),
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    # Fetch metadata describing a top-level document navigation.
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
# Product page whose HTML is parsed by the title and canonical-URL searches.
url = 'https://www.amazon.com/dp/B084JCXSL6/'
# requests applies no timeout by default; without one a stalled connection
# would block this script forever.
r = requests.get(url, headers=headers, timeout=10)
# Stop on an HTTP error status instead of regex-parsing an error page.
r.raise_for_status()
# Dump the raw HTML so the markup the patterns below rely on can be inspected.
print(r.text)
# Product page used for the price search, written as path + one query
# parameter per line; the pieces concatenate to a single URL string.
price_url = (
    'https://www.amazon.com/SAMSUNG-HW-T450-2-1ch-Soundbar-Dolby/dp/B085WTFCQ7/'
    'ref=pd_sim_504_1/133-1655367-3645832'
    '?_encoding=UTF8'
    '&pd_rd_i=B085WTFCQ7'
    '&pd_rd_r=7c12c2fb-a51f-4bbf-a230-226f45de404b'
    '&pd_rd_w=Ruwb1'
    '&pd_rd_wg=OdXTz'
    '&pf_rd_p=8958999c-906e-4b6b-80aa-4bc1f740ed92'
    '&pf_rd_r=MQVKGZR63A4ZTFC1YR9B'
    '&psc=1'
    '&refRID=MQVKGZR63A4ZTFC1YR9B'
)
""" title search """
# Debug aid: print the raw line(s) of markup that carry the productTitle id.
print(re.findall('<span.*id="productTitle".*', r.text))
# [^>]* keeps each wildcard inside the opening <span ...> tag. The previous
# greedy `.*` under re.DOTALL could run across the whole document before
# backtracking. re.DOTALL is still needed so the lazy capture can span the
# newlines around the title text.
title_search = re.search(
    r'<span[^>]*\sid="productTitle"[^>]*>(.*?)</span>',
    r.text,
    re.DOTALL,
)
# re.search returns None when the element is absent (e.g. a different page
# layout was served); report that instead of failing with AttributeError.
if title_search is None:
    print('product title not found')
else:
    print(title_search.group(1).strip())
"""
price search
<span id="priceblock_ourprice" class="a-size-medium a-color-price priceBlockBuyingPriceString">$167.99</span>
"""
# Fetch the price page on its own line, with a timeout: requests applies none
# by default, so a stalled connection would otherwise hang the script.
price_response = requests.get(price_url, headers=headers, timeout=10)
# [^>]* keeps each wildcard inside the opening <span ...> tag instead of
# letting a greedy `.*` under re.DOTALL run across the whole document.
price_search = re.search(
    r'<span[^>]*\sid="priceblock_ourprice"[^>]*>(.*?)</span>',
    price_response.text,
    re.DOTALL,
)
# re.search returns None when the price element is absent; report that
# instead of failing with AttributeError on .group().
if price_search is None:
    print('price not found')
else:
    print(price_search.group(1).strip())
"""canonical url search
<link rel="canonical" href="https://www.amazon.com/SAMSUNG-50-inch-Crystal-TU-8000-Built/dp/B084JCXSL6">
"""
# Match rel="canonical" and its href inside one <link ...> tag. The previous
# pattern used a greedy `.*` with re.DOTALL, which backtracks from the end of
# the page and therefore captured the LAST href= in the document, quotes
# included. [^>]* cannot leave the tag, and the group now holds only the text
# between the href quotes.
canonical_url_search = re.search(
    r'<link[^>]*\srel="canonical"[^>]*\shref="([^"]*)"',
    r.text,
)
# re.search returns None when no canonical link is present; report that
# instead of failing with AttributeError on .group().
if canonical_url_search is None:
    print('canonical url not found')
else:
    print(canonical_url_search.group(1).strip())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment