Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Created December 20, 2018 13:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save lobstrio/22d2d15723df7c478377d12376f60902 to your computer and use it in GitHub Desktop.
Web Scraping Python Script for the Xmas Deals on Amazon using Requests
# -*- coding: utf-8 -*-
# Copyright(C) 2018 Sasha Bouloudnine
import requests
import sys
import re
import ast
import json
import time
import csv
class AmazonXmasExtract:
    """Scrape the Amazon.fr Christmas (goldbox) deals page for one category
    node and collect the deal details from the GetDeals JSON API.

    Public interface: ``AmazonXmasExtract(node, write=None)`` plus
    ``extract()``, which returns the list of deal-detail dicts and, when
    ``write`` is truthy, also dumps them to a tab-separated CSV file.
    """

    BASE_URL = 'https://www.amazon.fr/b/ref=sd_allcat_xmasgno?ie=UTF8&node={}'
    DEALS_URL = 'https://www.amazon.fr/xa/dealcontent/v2/GetDeals?nocache={}'
    # The GetDeals endpoint is queried with at most this many deal ids at once.
    CHUNK_SIZE = 5

    def __init__(self, node, write=None, output_path='amazon_deals.csv'):
        """
        :param node: Amazon category node id (string), e.g. '2155235031'.
        :param write: truthy to dump the scraped deals to ``output_path``.
        :param output_path: destination TSV file. Replaces the previous
            machine-specific hard-coded Desktop path; default writes to the
            current working directory.
        """
        self.node = node
        self.write = write
        self.output_path = output_path
        self.s = requests.Session()
        # Desktop Safari UA so Amazon serves the regular HTML page.
        self.s.headers['User-Agent'] = (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) '
            'AppleWebKit/537.13+ (KHTML, like Gecko) '
            'Version/5.1.7 Safari/534.57.2'
        )

    def extract(self):
        """Run the full scrape and return the list of deal-detail dicts."""
        html = self._fetch_home_page()
        deal_ids = self._extract_deal_ids(html)
        mid = self._extract_marketplace_id(html)
        session_id = self._extract_session_id()
        results = self._fetch_deal_details(deal_ids, mid, session_id)
        # Guard: writing with zero results used to raise IndexError on
        # results[0]; now we simply skip the CSV step.
        if self.write and results:
            self._write_csv(results)
        return results

    def _fetch_home_page(self):
        """GET the category landing page; print the body and exit on non-200."""
        response = self.s.get(self.BASE_URL.format(self.node))
        if response.status_code != 200:
            print(response.text)
            sys.exit()
        return response.text

    def _extract_deal_ids(self, html):
        """Pull the embedded ``sortedDealIDs`` JS array out of the page HTML.

        :raises ValueError: when the pattern is absent or does not parse to
            a list (e.g. Amazon changed the page layout).
        """
        match = re.search(r'(?<="sortedDealIDs"\s:\s)[^\]]+\]', html)
        if match is None:
            raise ValueError('sortedDealIDs not found in page')
        deal_ids = ast.literal_eval(match.group(0))
        print('LIST OFFER ID: {}'.format(deal_ids))
        # Explicit raise instead of assert: asserts vanish under `python -O`.
        if not isinstance(deal_ids, list):
            raise ValueError('sortedDealIDs did not parse to a list')
        return deal_ids

    def _extract_marketplace_id(self, html):
        """Pull the Amazon marketplace id (``ue_mid``) out of the page HTML."""
        match = re.search(r'(?<=ue_mid = \')\w+', html)
        if match is None:
            raise ValueError('ue_mid not found in page')
        mid = match.group(0)
        print('MID: {}'.format(mid))
        return mid

    def _extract_session_id(self):
        """Read the session id from the first cookie set on the session.

        Returns an empty string instead of raising when Amazon set no
        cookie at all (previously an IndexError).
        """
        cookie_values = self.s.cookies.values()
        session_id = cookie_values[0] if cookie_values else ''
        print('SESSION ID: {}'.format(session_id))
        return session_id

    def _fetch_deal_details(self, deal_ids, mid, session_id):
        """POST deal ids to GetDeals in chunks and collect the detail dicts."""
        results = []
        for start in range(0, len(deal_ids), self.CHUNK_SIZE):
            chunk = deal_ids[start:start + self.CHUNK_SIZE]
            for deal_id in chunk:
                print(deal_id)
            payload = json.dumps({
                'requestMetadata': {
                    'marketplaceID': mid,
                    'clientID': 'goldbox_mobile_pc',
                    'sessionID': session_id,
                },
                'dealTargets': [{'dealID': deal_id} for deal_id in chunk],
                'responseSize': 'STATUS_ONLY',
                'itemResponseSize': 'DEFAULT_WITH_PREEMPTIVE_LEAKING',
            })
            # 13-digit millisecond timestamp used as a cache-buster.
            nocache = str(int(time.time() * 1000))
            url = self.DEALS_URL.format(nocache)
            data = self.s.post(url=url, data=payload).json()
            time.sleep(1)  # cool pause: be polite between API calls
            results.extend(data['dealDetails'].values())
        return results

    def _write_csv(self, results):
        """Dump results as tab-separated rows; header keys from the first deal."""
        # newline='' is required by the csv module to avoid doubled row
        # separators on Windows; utf-8 keeps accented product names intact.
        with open(self.output_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, delimiter='\t',
                                    fieldnames=results[0].keys())
            writer.writeheader()
            for row in results:
                writer.writerow(row)
if __name__ == '__main__':
    # Scrape the Xmas deals for node 2155235031 and dump them to CSV.
    extractor = AmazonXmasExtract(node='2155235031', write=True)
    extractor.extract()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment