You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
https://www.amazon.in/s?k=books&s=price-asc-rank # low to high
https://www.amazon.in/s?k=books&s=price-desc-rank # high to low
https://www.amazon.in/s?k=books&s=relevanceblender # featured
https://www.amazon.in/s?k=books&s=review-rank # Average customer review
https://www.amazon.in/s?k=books&s=date-desc-rank # Newest Arrival
Version 2
Link to item | Item Name | Review count | Item Rating | Image download links | Item Availability | Price
Data is stored in a CSV file, and the highest-resolution image of each product is saved in the directory where the script is running.
To send these values to some other database, you can simply return them or handle them however you like just before csv_append is called.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# This code scrapes a single query from Amazon | Purpose modified
from bs4 import BeautifulSoup as soup
import requests
import os
import csv

# vars with GLOBAL scope
bot_query = "https://www.amazon.in/s?k="  # Default order, no filters are being applied
HEADER = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
PROXY = {"https": "138.0.207.18:58566"}
# {"https": "https//124.107.229.210:8080", "http":"http//124.107.229.210:8080"} | Proxy rotation is absent | pip package will be released sooner
csv_index = 0  # To keep proper indexing when adding data from different pages


## Error handling needed when image sizes mismatch
def csv_append(user_query='-', item_count='-', name='-', item_page='-', rating='-', review='-', is1='-', is15='-', is2='-', is25='-', is3='-', price='-'):
    """Append one scraped item as a row of ``<user_query>.csv``.

    When the module-level ``is_first`` flag is True (fresh scrape) the file
    is created and the header row is written first; otherwise the row is
    appended to the existing file.  The module-level ``csv_index`` counter
    supplies the serial number, overriding the caller's ``item_count`` so
    numbering stays continuous across pages.

    :param1: user_query -- search term ('+'-joined); becomes the file name
    :param2: item_count -- caller's per-page count (overridden, see above)
    Remaining parameters are the scraped columns; ``'-'`` marks missing data.
    """
    global is_first
    global csv_index
    csv_index += 1
    item_count = csv_index  # Overtake: serial number follows the global counter
    csv_name = user_query.replace('+', '-') + '.csv'
    # Single open; only the mode and the header depend on is_first
    # (previously the writer/writerow logic was duplicated in both branches).
    mode = 'w' if is_first else 'a+'
    with open(csv_name, mode, newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if is_first:
            writer.writerow(["SN", "Name", "Link to item", "Rating", "Reviews", "Image Download link 1x", "Image Download link 1.5x", "Image Download link 2x", "Image Download link 2.5x", "Image Download link 3x", "Price"])
        writer.writerow([item_count, name, item_page, rating, review.replace(',', ''), is1, is15, is2, is25, is3, price])
def slugify(img_name):
    """Turn an item name into a .jpg filename.

    Spaces become hyphens, double quotes are dropped, and each of the
    characters ``( ) | & :`` becomes a single space.  (The ASCII range
    could be checked instead; more info is needed about how the images
    will be used.)

    :param1: image-name
    :return: slugged name with the image format appended
    """
    table = str.maketrans({' ': '-', '"': None, '(': ' ', ')': ' ', '|': ' ', '&': ' ', ':': ' '})
    return "{}.{}".format(img_name.translate(table), 'jpg')
def img_download(url, img_name):
    """Download the image at *url* into the current working directory.

    The filename is slugified and reduced to ASCII (non-ASCII characters
    are dropped) so it is safe on common filesystems.

    :param1: url
    :param2: image-name
    :raises requests.HTTPError: on an error status -- callers wrap this
        call in ``try/except Exception`` and log, so a failed download no
        longer silently writes an HTML error page as a .jpg
    """
    # Decode back to str: the old bytes path worked on POSIX but is fragile elsewhere.
    img_name = slugify(img_name).encode('ascii', 'ignore').decode('ascii')
    img = requests.get(url, headers=HEADER)
    img.raise_for_status()
    with open(img_name, 'wb') as f:  # TODO: put them in a folder properly
        f.write(img.content)
def scrape_bundle(user_query, page_count):
    """Scrape pages 1..page_count for *user_query* into a single CSV.

    Sets the module-level ``is_first`` flag so only the very first page
    creates the file and writes the header row.

    :param1: user_query -- search term ('+'-joined)
    :param2: page_count -- number of pages to scrape (string or int)
    """
    global is_first
    is_first = True
    total = int(page_count)
    page = 0
    while page < total:
        page += 1
        scrape_query(user_query, page)
        print(page)
        is_first = False
def scrape_query(user_query, page_count):
    """Scrape one Amazon search-result page and append each item to the CSV.

    The highest-resolution product image is also downloaded to the working
    directory.

    :param1: user_query -- search term ('+'-joined)
    :param2: page_count -- page number to fetch
    :return: None if no item appears on the page

    Scraped Data:: Link to item | Item Name | Review count | Item Rating |
    Image download links | Item Availability | Price
    """
    global is_first
    amazon_query = bot_query + user_query + '&' + 'page=' + str(page_count)
    client = requests.get(amazon_query, headers=HEADER)  # proxies=PROXY | gimmeproxy
    page_soup = soup(client.text, "html.parser")
    container = page_soup.findAll("div", {"class": "s-result-item"})
    ## Function can be made for below code | too much arg passing
    if len(container) > 0:
        item_count = 0
        for item in container:
            item_count += 1
            item_page = 'https://www.amazon.in' + item.a['href']
            # Parse "url scale" pairs from the srcset attribute; some result
            # tiles carry no image, so fall back to None.
            try:
                image_srcset = [(link.split(' ')[0], link.split(' ')[1])
                                for link in item.a.img['srcset'].split(', ')]
            except (AttributeError, KeyError, TypeError):
                image_srcset = None
            img_alt = item.img['alt']  # item-name
            try:
                rating = item.span.find("span", {"class": "a-icon-alt"}).text
                review_count = item.span.find("span", {"class": "a-size-base"}).text
            except AttributeError:
                rating = '-'
                review_count = '-'
            # Availability -- fixed: the old loop re-assigned the flag on every
            # span, so only the LAST span decided the value.  (Computed but not
            # yet stored in the CSV.)
            if any(x.text == "Currently unavailable." for x in item.span.find_all("span")):
                availability = "Currently unavailable."
            else:
                availability = "Available"
            price = item.span.find("span", {"class": "a-price-whole"})
            price = price.text if price is not None else '-'
            # Downloading most HR image (the last srcset entry is the largest).
            try:
                img_download(url=image_srcset[-1][0], img_name=img_alt)
            except Exception as e:
                print(str(e))
            # Collect the per-scale links independently, so one missing size no
            # longer wipes out the sizes that WERE present.
            links = []
            for idx in (0, 1, 2, 3, -1):
                try:
                    links.append(image_srcset[idx][0])
                except (TypeError, IndexError):
                    links.append('-')
            is1, is15, is2, is25, is3 = links
            csv_append(user_query=user_query, item_count=item_count, name=img_alt,
                       item_page=item_page, rating=rating, review=review_count,
                       is1=is1, is15=is15, is2=is2, is25=is25, is3=is3, price=price)
            is_first = False
    else:
        return None
if __name__ == "__main__":
    # More filtering is needed | search 'mobile' to get all mobile data scraped
    user_query = input("Enter your Product: ").replace(' ', '+')
    page_count = input("Scrape upto how many pages: ")
    # Fall back to a single page when the answer is empty-ish.
    pages = '1' if page_count in ['0', ' ', None] else page_count
    scrape_bundle(user_query, pages)
Version 1
Link to item | Item Name | Review count | Item Rating | Image download links | Item Availability | Price
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# This code scrapes a single query from Amazon | Purpose modified, not targeting images for now
from bs4 import BeautifulSoup as soup
import requests
import os
import csv

# vars with GLOBAL scope
bot_query = "https://www.amazon.in/s?k="  # Default order, no filters are being applied
HEADER = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'}
PROXY = {"https": "138.0.207.18:58566"}
# {"https": "https//124.107.229.210:8080", "http":"http//124.107.229.210:8080"} | Proxy rotation is absent | pip package will be released sooner
csv_index = 0  # To keep proper indexing when adding data from different pages


## Error handling needed when image sizes mismatch
def csv_append(user_query='-', item_count='-', name='-', item_page='-', rating='-', review='-', is1='-', is15='-', is2='-', is25='-', is3='-'):
    """Append one scraped item as a row of ``<user_query>.csv``.

    When the module-level ``is_first`` flag is True (fresh scrape) the file
    is created and the header row is written first; otherwise the row is
    appended.  The module-level ``csv_index`` counter supplies the serial
    number, overriding the caller's ``item_count`` so numbering stays
    continuous across pages.

    :param1: user_query -- search term ('+'-joined); becomes the file name
    :param2: item_count -- caller's per-page count (overridden, see above)
    Remaining parameters are the scraped columns; ``'-'`` marks missing data.
    """
    global is_first
    global csv_index
    csv_index += 1
    item_count = csv_index  # Overtake: serial number follows the global counter
    csv_name = user_query.replace('+', '-') + '.csv'
    # Single open; only the mode and the header depend on is_first
    # (previously the writer/writerow logic was duplicated in both branches).
    # encoding fixed to utf-8 so non-ASCII item names don't crash on
    # platforms whose default codec can't represent them.
    mode = 'w' if is_first else 'a+'
    with open(csv_name, mode, newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if is_first:
            writer.writerow(["SN", "Name", "Link to item", "Rating", "Reviews", "Image Download link 1x", "Image Download link 1.5x", "Image Download link 2x", "Image Download link 2.5x", "Image Download link 3x"])
        writer.writerow([item_count, name, item_page, rating, review.replace(',', ''), is1, is15, is2, is25, is3])
def slugify(img_name):
    """Turn an item name into a .jpg filename.

    Spaces become hyphens, double quotes are dropped, and parentheses each
    become a single space.

    :param1: image-name
    :return: slugged name with the image format appended
    """
    table = str.maketrans({' ': '-', '"': None, '(': ' ', ')': ' '})
    return "{}.{}".format(img_name.translate(table), 'jpg')
def img_download(url, img_name):
    """Fetch the image at *url* and store it in the working directory.

    :param1: url
    :param2: image-name (slugified before use)
    """
    response = requests.get(url, headers=HEADER)
    target = slugify(img_name)
    with open(target, 'wb') as f:  # TODO: put them in a folder properly
        f.write(response.content)
def scrape_bundle(user_query, page_count):
    """Scrape pages 1..page_count for *user_query* into a single CSV.

    Sets the module-level ``is_first`` flag so only the very first page
    creates the file and writes the header row.

    :param1: user_query -- search term ('+'-joined)
    :param2: page_count -- number of pages to scrape (string or int)
    """
    global is_first
    is_first = True
    total = int(page_count)
    page = 0
    while page < total:
        page += 1
        scrape_query(user_query, page)
        print(page)
        is_first = False
def scrape_query(user_query, page_count):
    """Scrape one Amazon search-result page and append each item to the CSV.

    The highest-resolution product image is also downloaded to the working
    directory.

    :param1: user_query -- search term ('+'-joined)
    :param2: page_count -- page number to fetch
    :return: None if no item appears on the page

    Scraped Data:: Link to item | Item Name | Review count | Item Rating |
    Image download links | Item Availability
    """
    global is_first
    amazon_query = bot_query + user_query + '&' + 'page=' + str(page_count)
    client = requests.get(amazon_query, headers=HEADER)  # proxies=PROXY | gimmeproxy
    page_soup = soup(client.text, "html.parser")
    container = page_soup.findAll("div", {"class": "s-result-item"})
    ## Function can be made for below code | too much arg passing
    if len(container) > 0:
        item_count = 0
        for item in container:
            item_count += 1
            item_page = 'https://www.amazon.in' + item.a['href']
            # Parse "url scale" pairs from the srcset attribute.  Fixed: this
            # used to be unguarded, so a single result tile without an image
            # crashed the whole page scrape.
            try:
                image_srcset = [(link.split(' ')[0], link.split(' ')[1])
                                for link in item.a.img['srcset'].split(', ')]
            except (AttributeError, KeyError, TypeError):
                image_srcset = None
            img_alt = item.img['alt']  # item-name
            try:
                rating = item.span.find("span", {"class": "a-icon-alt"}).text
                review_count = item.span.find("span", {"class": "a-size-base"}).text
            except AttributeError:
                rating = '-'
                review_count = '-'
            # Availability -- fixed: the old loop re-assigned the flag on every
            # span, so only the LAST span decided the value.  (Computed but not
            # yet stored in the CSV.)
            if any(x.text == "Currently unavailable." for x in item.span.find_all("span")):
                availability = "Currently unavailable."
            else:
                availability = "Available"
            # Downloading most HR image (the last srcset entry is the largest).
            try:
                img_download(url=image_srcset[-1][0], img_name=img_alt)
            except Exception as e:
                print(str(e))
            # Collect the per-scale links independently, so one missing size no
            # longer wipes out the sizes that WERE present.
            links = []
            for idx in (0, 1, 2, 3, -1):
                try:
                    links.append(image_srcset[idx][0])
                except (TypeError, IndexError):
                    links.append('-')
            is1, is15, is2, is25, is3 = links
            csv_append(user_query=user_query, item_count=item_count, name=img_alt,
                       item_page=item_page, rating=rating, review=review_count,
                       is1=is1, is15=is15, is2=is2, is25=is25, is3=is3)
            is_first = False
    else:
        return None
if __name__ == "__main__":
    # More filtering is needed | search 'mobile' to get all mobile data scraped
    user_query = input("Enter your Product: ").replace(' ', '+')
    page_count = input("Scrape upto how many pages: ")
    print(page_count)
    if page_count not in ['0', ' ', None]:
        print('me chala')  # debug: scraping the requested page count
        scrape_bundle(user_query, page_count)
    else:
        print('me 0 pe chalunga')  # debug: falling back to a single page
        scrape_bundle(user_query, '1')