Skip to content

Instantly share code, notes, and snippets.

@lobstrio lobstrio/amazon_extract.py
Last active Jul 11, 2018

Embed
What would you like to do?
#!/usr/bin/python3
# coding: utf-8
import csv
import datetime

import requests
from bs4 import BeautifulSoup
from scrapy import Selector
def extract(url, output_path="/path/to/amazon/asin.csv"):
    """
    Export all ASIN/Name pairs from an Amazon Web Page.

    The page is downloaded, every ``<li>`` whose ``id`` contains ``result_``
    is treated as one product, and one tab-separated row per product is
    written to ``output_path`` under the header ``Name`` / ``ASIN``. The HTTP
    status code, each extracted pair and the elapsed time are printed to
    stdout. A non-200 response is not treated as an error: the status is
    printed and whatever body came back is parsed.

    Arguments:
        url (str):
            url of the aimed Amazon Web Page
        output_path (str):
            where the tab-separated file is written; defaults to the
            location the script has always used
    Return:
        None -- the result is the file written at ``output_path``
    """
    # INITIALISATION
    start = datetime.datetime.now()

    # FETCH THE PAGE SOURCE
    # The session is a context manager so its connection pool is released
    # even when the request raises. The body is fully read by get(), so
    # response.text stays usable after the session is closed.
    with requests.Session() as session:
        response = session.get(url=url)
    print("-- STATUS CODE --")
    print(response.status_code)

    # PARSING
    # The html.parser round-trip is kept from the original; presumably it
    # normalises malformed markup before lxml (inside Selector) sees it.
    soup = BeautifulSoup(response.text, features='html.parser')
    page = Selector(text=soup.prettify())
    products = page.xpath("//li[contains(@id, 'result_')]")

    # CSV CREATION
    # newline='' is what the csv module requires to avoid doubled line
    # endings on Windows; utf-8 keeps accented product names writable
    # regardless of the platform's default encoding.
    fieldnames = ['Name', 'ASIN']
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for product in products:
            # Relative XPaths query the already-parsed <li> node directly
            # instead of serialising and re-parsing each product fragment.
            name = product.xpath(".//a/@title").extract_first()
            # descendant-or-self keeps the original "first li carrying a
            # data-asin" semantics, including the matched <li> itself.
            asin = product.xpath(
                "descendant-or-self::li/@data-asin"
            ).extract_first()
            print(name, asin)
            writer.writerow({'Name': name, 'ASIN': asin})

    # TIME ELAPSED
    end = datetime.datetime.now()
    time_elapsed = str(end - start)
    print('\n')
    print('-- TIME ELAPSED --')
    print(time_elapsed)
def main():
    """Run ``extract`` on the hard-coded amazon.fr URL below."""
    # The URL is split across two adjacent literals only to keep the line
    # length manageable; Python joins them into a single string.
    target = (
        "https://www.amazon.fr/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A405322%2Cp_n_feature_sixteen_brows"
        "e-bin%3A5704718031%2Cp_76%3A183940031&bbn=405322&ie=UTF8&qid=1531321291"
    )
    extract(target)


# Only run the scrape when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.