@hassanabidpk
Last active August 26, 2018 05:19
from bs4 import BeautifulSoup
import requests
import re
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
"""
example_url : https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dbeauty&field-keywords=lipstick&page=1
Returns total number of recommended_items for a single searched item
<Every page has 48 search results>
Installation
1- Download chromedriver_mac64.zip from http://chromedriver.storage.googleapis.com/index.html?path=2.24/
2- Unzip and place in the same folder with code and note down the path.
3- Repace CHROME_WEBDRIVER_PATH with Path on your computer for chromedriver
How to Run (python 3)
$ python -m venv amazonvenv
$ source amazonvenv\Scripts\activate
$ pip install requests
$ pip install beautifulsoup4
$ pip install -U selenium
$ pip install html5lib
$ python crawling.py
"""
BASE_URL = "https://www.amazon.com/s/ref=nb_sb_noss_2"
KEYWORD = "lipstick"
headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
CHROME_WEBDRIVER_PATH = '/Users/hassanabid/Documents/hassan/GDE_code/web_crawling_amazon/chromedriver'
START_PAGE = 1
END_PAGE = 2
def main():
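    """Entry point: read the first results page to estimate the total page
    count, then crawl pages START_PAGE..END_PAGE and collect recommended
    items for every product found on each page."""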
    payload_main = {'url': "search-alias%3Dbeauty", 'field-keywords': KEYWORD, "page": 1}
    r_amazon = requests.get(BASE_URL, params=payload_main, headers=headers)
    soup = BeautifulSoup(r_amazon.text, 'html5lib')
    count_raw = soup.select("#s-result-count")
    if count_raw:
        result = re.search(r'[\d]+.[\d]+.*[\d]+.[\d]+', count_raw[0].text)
        if result:
            total_pages_re = re.search(r'[\d]+,[\d]+', result.group())
            total_pages_count = int(total_pages_re.group().replace(",", "")) / 48
            print("Total_Page_Count : {}".format(int(total_pages_count)))
            for i in range(START_PAGE, END_PAGE + 1):
                item_links_raw = get_next_page(i)
                items_soup, items_links = get_items_soup(total_pages_count, item_links_raw)
                find_recommended_items(items_soup, items_links, i)
    else:
        print("nothing found for {}".format(r_amazon.url))
def get_next_page(page_no):
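    """Fetch one search results page and return the anchor tags that link to
    the individual product pages."""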
    payload_main = {'url': "search-alias%3Dbeauty", 'field-keywords': KEYWORD, "page": page_no}
    r_amazon = requests.get(BASE_URL, params=payload_main, headers=headers)
    soup = BeautifulSoup(r_amazon.text, 'html5lib')
    count_raw = soup.select("#s-result-count")
    item_links_raw = soup.select("#resultsCol .a-row .a-spacing-top-mini > a")
    print("Page No : {} - len(item_links) : {}".format(page_no, len(item_links_raw)))
    return item_links_raw
def get_items_soup(total_pages_count,item_links_raw):
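    """Download every product page linked from a results page and return the
    parsed BeautifulSoup objects together with the links that succeeded."""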
    items_soup = []
    valid_item_links = []
    for item in item_links_raw:
        single_item_link = item["href"]
        try:
            r_single_item = requests.get(single_item_link, headers=headers)
            soup = BeautifulSoup(r_single_item.text, "html5lib")
            items_soup.append(soup)
            print("fetched item no. {}".format(len(items_soup)))
            valid_item_links.append(single_item_link)
        except:
            print("couldn't find!")
    return items_soup, valid_item_links
def find_recommended_items(items_soup,item_links,page_no):
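    """For every crawled product page, extract title/brand/price/ASIN, drive
    the recommendation carousel with Selenium to collect extra links, fetch
    each recommended item's details, and write everything to CSV."""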
    key_items = 0
    titles = []
    brands = []
    prices = []
    asins = []
    recommended_items = {}
    for index, soup in enumerate(items_soup):
        main_title, brand, price, main_asin = find_title_brand_price(soup)
        if not main_title:
            # TODO: move this below and add a case when the title is not found (empty [] in dict)
            print("title not found for item :{} - link: {}".format(key_items, item_links[index]))
            continue
        else:
            key_items = index + 1
            titles.append(main_title)
            brands.append(brand)
            prices.append(price)
            asins.append(main_asin)
        recommended_items_raw = soup.select("#purchase-sims-feature .a-carousel-viewport li a")
        print("fetching recommended_items (count: {}) for {}".format(len(recommended_items_raw), key_items))
        rec_items_href = []
        rec_items_titles = []
        rec_items_brands = []
        rec_items_prices = []
        rec_items_asins = []
        recommended_items_raw_extended = initiate_webdriver(item_links[index], recommended_items_raw)
        for item in recommended_items_raw_extended:
            try:
                # print("item : {}".format(item["href"]))
                if not "product-reviews" in item["href"]:
                    rec_items_href.append(item["href"])
            except KeyError:
                print("no link found")
        print("found {} valid links in rec_items for {}".format(len(rec_items_href), key_items))
        # keep at most 30 links so every row fits the 30 rec_item columns written by writecsv()
        rec_items_href = rec_items_href[0:30]
        print("fetch title, brand, price and asin for rec items")
        for single_href in rec_items_href:
            try:
                r_single_rec_item = requests.get("https://www.amazon.com{}".format(single_href), headers=headers)
                soup = BeautifulSoup(r_single_rec_item.text, "html5lib")
                title, brand, price, asin = find_title_brand_price(soup)
                if title and brand:
                    if not title in rec_items_titles:
                        # print("title : {}".format(len(title)))
                        rec_items_titles.append(title)
                        rec_items_brands.append(brand)
                        rec_items_prices.append(price)
                        rec_items_asins.append(asin)
            except:
                print("couldn't fetch single_href :{}".format(single_href))
        items_dict = {"titles": rec_items_titles, "brands": rec_items_brands, "prices": rec_items_prices, "asins": rec_items_asins}
        recommended_items[main_asin] = items_dict
        print("recorded recommended items for key:{} - values:{}".format(main_asin, len(recommended_items)))
    # write to csv
    writecsv(titles, brands, prices, asins, recommended_items, page_no)
def find_title_brand_price(soup):
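    """Pull the product title, brand, price and ASIN out of a parsed product
    page; any field that cannot be found is returned as None."""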
    title = None
    brand = None
    price = None
    asin = None
    title_raw = soup.select("#productTitle")
    brand_raw = soup.select("#brand")
    price_raw = soup.select("#priceblock_ourprice")
    asin_raw = soup.select("#detail-bullets")
    max_recs = soup.select("#purchase-sims-feature span.a-carousel-page-max")
    if not (title_raw or brand_raw):
        return title, brand, price, asin
    else:
        try:
            title = title_raw[0].string.strip()
            brand = brand_raw[0].string.strip()
            if not asin_raw:
                asin_raw = soup.select("#productDetails_detailBullets_sections1 .a-size-base")
                asin = asin_raw[1].string.strip()
            else:
                asin = get_asin(asin_raw)
            if not price_raw:
                price_raw = soup.select("#priceblock_saleprice")
            price = price_raw[0].string.strip()
            return title, brand, price, asin
        except:
            return title, brand, price, asin
def get_asin(soup_raw):
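    """Extract the ASIN from the "#detail-bullets" block of a product page."""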
    asin_find = re.search(r'.*ASIN:.*[\w]{10}', soup_raw[0].text)
    if asin_find:
        # e.g. "ASIN: B01HRNEHRE"
        return asin_find.group(0).split(" ")[1]
    else:
        print("asin not found :(")
        return None
def writecsv(titles,brands,prices,asins,rec_items,page_no):
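    """Write one CSV file per results page (amazon_<page_no>.csv): one row per
    main item, followed by name/brand/price/ASIN columns for up to 30
    recommended items."""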
print("start writing {} rec_items to csv".format(len(rec_items)))
rec_items_name = ["rec_item{}_name".format(i) for i in range(1,81) ]
rec_items_brands = ["rec_item{}_brand".format(i) for i in range(1,81) ]
rec_items_price = ["rec_item{}_price".format(i) for i in range(1,81) ]
rec_items_asin = ["rec_item{}_asin".format(i) for i in range(1,81) ]
rec_items_fieldnames = create_rec_items_header(rec_items_name,rec_items_brands,rec_items_price,rec_items_asin)
fieldnames = ['item_num','item_name', 'item_brand','item_price','item_asin']
fieldnames.extend(rec_items_fieldnames)
for key in rec_items:
print("key : {}".format(key))
with open('amazon_'+str(page_no)+'.csv', 'a', newline='') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for index_main, title in enumerate(titles):
rec_items_one = rec_items.get(asins[index_main])
rec_items_titles_list = rec_items_one.get("titles")
rec_items_brands_list = rec_items_one.get("brands")
rec_items_prices_list = rec_items_one.get("prices")
rec_items_asins_list = rec_items_one.get("asins")
row = {'item_num': str(index_main+1), 'item_name': title,'item_brand': brands[index_main],'item_price': prices[index_main],'item_asin': asins[index_main]}
for index, rec_title in enumerate(rec_items_titles_list):
row["rec_item{}_name".format(index+1)] = rec_title
row["rec_item{}_brand".format(index+1)] = rec_items_brands_list[index]
row["rec_item{}_price".format(index+1)] = rec_items_prices_list[index]
row["rec_item{}_asin".format(index+1)] = rec_items_asins_list[index]
print("writing row : {}".format(index_main+1))
writer.writerow(row)
def create_rec_items_header(names,brands,prices,asins):
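    """Interleave the generated rec_item*_name/brand/price/asin column names
    for the first 30 recommended-item slots into one fieldnames list."""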
    result = []
    for i in range(0, 30):
        result.append(names[i])
        result.append(brands[i])
        result.append(prices[i])
        result.append(asins[i])
    return result
def initiate_webdriver(href,item_raw_links):
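    """Open the product page in Chrome via Selenium and repeatedly click the
    recommendation carousel's "next" button, collecting the anchor tags from
    every carousel page. Falls back to the "#day0-sims-feature" carousel when
    "#purchase-sims-feature" has no next-page button."""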
print("initiate_webdriver for : {}".format(href))
result = []
result.extend(item_raw_links)
driver = webdriver.Chrome(CHROME_WEBDRIVER_PATH)
driver.implicitly_wait(10)
driver.get(href)
print("wait...... fetching data")
driver.execute_script("window.scrollTo(0, 1200);")
time.sleep(2)
next_button = None
element = None
second_case = False
try:
next_button = driver.find_element_by_css_selector("#purchase-sims-feature a.a-carousel-goto-nextpage")
except:
print("get next_button after scrolling to 3000px more!!")
second_case = True
driver.execute_script("window.scrollTo(0, 3000);")
time.sleep(2)
next_button = driver.find_element_by_css_selector("#day0-sims-feature a.a-carousel-goto-nextpage")
print("next_button_raw_soup : {}".format(next_button))
for i in range(6,80,5):
if len(result) > 95:
print("fetched 40 rec items")
break
try :
print("click executed for {}".format(i))
next_button.click()
time.sleep(3)
html = driver.page_source
soup = BeautifulSoup(html,"html5lib")
if second_case:
recommended_items_raw = soup.select("#day0-sims-feature .a-carousel-viewport li a")
else :
recommended_items_raw = soup.select("#purchase-sims-feature .a-carousel-viewport li a")
print("fetched recommended_items using webdriver: {}".format(len(recommended_items_raw)))
result.extend(recommended_items_raw)
time.sleep(2)
except:
print("next_button is click failed - so try again")
time.sleep(3)
try:
next_button = driver.find_element_by_css_selector("#purchase-sims-feature a.a-carousel-goto-nextpage")
except:
second_case = True
driver.execute_script("window.scrollTo(0, 3000);")
time.sleep(2)
next_button = driver.find_element_by_css_selector("#day0-sims-feature a.a-carousel-goto-nextpage")
driver.quit()
print("initiate_webdriver results : {}".format(len(result)))
return result
if __name__ == '__main__':
    main()