# lobstrio/amazon_extract.py

## amazon_extract.py
#!/usr/bin/python3
# coding: utf-8

import csv
import datetime

import requests
from bs4 import BeautifulSoup
from scrapy import Selector

def extract(url, output_path="/path/to/amazon/asin.csv", timeout=30):
    """
    Export every product Name/ASIN pair found on an Amazon results page.

    The page is downloaded, every ``<li>`` whose ``id`` contains ``result_``
    is treated as one product, and one tab-separated row (``Name``, ``ASIN``)
    is written per product. The HTTP status code, each extracted pair and
    the elapsed time are printed to stdout.

    Arguments:
        url (str):
            url of the aimed Amazon Web Page
        output_path (str):
            where the tab-separated file is written; the default is the
            path this function has always used
        timeout (float):
            seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the script forever

    Return:
        None. The result is the file written at ``output_path``.
    """

    # INITIALISATION
    fieldnames = ['Name', 'ASIN']
    start = datetime.datetime.now()

    # FETCH THE PAGE SOURCE
    # The session is used as a context manager so its pooled connections
    # are released; the body is fully downloaded before the block exits.
    with requests.Session() as session:
        response = session.get(url=url, timeout=timeout)
    print("-- STATUS CODE --")
    print(response.status_code)

    # PARSING
    # BeautifulSoup re-serialises the markup through html.parser before
    # scrapy's Selector is built on it for XPath queries.
    soup = BeautifulSoup(response.text, features='html.parser')
    page = Selector(text=soup.prettify())
    products = page.xpath("//li[contains(@id, 'result_')]")

    # CSV CREATION
    # The file is opened only after parsing succeeded, so a parsing error no
    # longer leaves a truncated file behind. newline='' is what the csv
    # module expects; utf-8 keeps non-ASCII product names writable
    # regardless of the locale.
    with open(output_path, "w", newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for product in products:
            # Relative XPath on the already-parsed node replaces the former
            # serialise -> BeautifulSoup -> Selector round trip per product.
            name = product.xpath(".//a/@title").extract_first()
            # descendant-or-self mirrors the old "//li" lookup: the product
            # <li> itself first, then any nested <li>.
            asin = product.xpath(
                "descendant-or-self::li/@data-asin").extract_first()
            print(name, asin)
            writer.writerow(dict(zip(fieldnames, [name, asin])))

    # ELAPSED TIME
    time_elapsed = str(datetime.datetime.now() - start)
    print('\n')
    print('-- TIME ELAPSED --')
    print(time_elapsed)

def main():
    """Run ``extract`` on the hard-coded amazon.fr search results page."""
    # Adjacent literals inside the parentheses are joined into a single URL.
    search_page = (
        "https://www.amazon.fr/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A405322%2Cp_n_feature_sixteen_brows"
        "e-bin%3A5704718031%2Cp_76%3A183940031&bbn=405322&ie=UTF8&qid=1531321291"
    )
    extract(search_page)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
	#!/usr/bin/python3
	# coding: utf-8

	import requests
	from bs4 import BeautifulSoup
	from scrapy import Selector
	import csv

	def extract(url, output_path="/path/to/amazon/asin.csv", timeout=30):
	    """
	    Export every product Name/ASIN pair found on an Amazon results page.

	    The page is downloaded, every ``<li>`` whose ``id`` contains ``result_``
	    is treated as one product, and one tab-separated row (``Name``, ``ASIN``)
	    is written per product. The HTTP status code, each extracted pair and
	    the elapsed time are printed to stdout.

	    Arguments:
	        url (str):
	            url of the aimed Amazon Web Page
	        output_path (str):
	            where the tab-separated file is written; the default is the
	            path this function has always used
	        timeout (float):
	            seconds to wait for the server before ``requests`` raises,
	            so a stalled connection cannot hang the script forever

	    Return:
	        None. The result is the file written at ``output_path``.
	    """

	    # INITIALISATION
	    fieldnames = ['Name', 'ASIN']
	    start = datetime.datetime.now()

	    # FETCH THE PAGE SOURCE
	    # The session is used as a context manager so its pooled connections
	    # are released; the body is fully downloaded before the block exits.
	    with requests.Session() as session:
	        response = session.get(url=url, timeout=timeout)
	    print("-- STATUS CODE --")
	    print(response.status_code)

	    # PARSING
	    # BeautifulSoup re-serialises the markup through html.parser before
	    # scrapy's Selector is built on it for XPath queries.
	    soup = BeautifulSoup(response.text, features='html.parser')
	    page = Selector(text=soup.prettify())
	    products = page.xpath("//li[contains(@id, 'result_')]")

	    # CSV CREATION
	    # The file is opened only after parsing succeeded, so a parsing error no
	    # longer leaves a truncated file behind. newline='' is what the csv
	    # module expects; utf-8 keeps non-ASCII product names writable
	    # regardless of the locale.
	    with open(output_path, "w", newline='', encoding='utf-8') as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
	        writer.writeheader()
	        for product in products:
	            # Relative XPath on the already-parsed node replaces the former
	            # serialise -> BeautifulSoup -> Selector round trip per product.
	            name = product.xpath(".//a/@title").extract_first()
	            # descendant-or-self mirrors the old "//li" lookup: the product
	            # <li> itself first, then any nested <li>.
	            asin = product.xpath(
	                "descendant-or-self::li/@data-asin").extract_first()
	            print(name, asin)
	            writer.writerow(dict(zip(fieldnames, [name, asin])))

	    # ELAPSED TIME
	    time_elapsed = str(datetime.datetime.now() - start)
	    print('\n')
	    print('-- TIME ELAPSED --')
	    print(time_elapsed)

	def main():
	    """Run ``extract`` on the hard-coded amazon.fr search results page."""
	    # Adjacent literals inside the parentheses are joined into a single URL.
	    url = (
	        "https://www.amazon.fr/s/ref=sr_pg_1?fst=as%3Aoff&rh=n%3A405322%2Cp_n_feature_sixteen_brows"
	        "e-bin%3A5704718031%2Cp_76%3A183940031&bbn=405322&ie=UTF8&qid=1531321291"
	    )
	    extract(url)

	# Run the scraper only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()