Amazon Scraper Demo
"""
Author: FFC12
Date: 22.05.2021
Task:
*This script scrapes product titles and ASINs from a particular link on Amazon.
*Bu script Amazon'dan belirli linklerdeki ürün başıklarını ve ASIN numaralarını çekip PostgresSQL'e kaydeder.
Note: Change the 'db_username' and 'db_pass' for yourself.
*Requirements:
- beautifulsoup
- sqlalchemy
- fake_useragent
- psycopg2
"""
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from fake_useragent import UserAgent

class DBDemoDriver:
    def __init__(self, db_username, db_pass):
        # Connect to the default 'postgres' database on localhost.
        self.db = create_engine('postgresql://' + db_username + ':' + db_pass + '@localhost/postgres')
        # Create the amazon_recap table if it does not exist yet.
        # Note: Engine.execute() relies on SQLAlchemy 1.x implicit execution
        # (it was removed in SQLAlchemy 2.0).
        self.db.execute("""CREATE TABLE IF NOT EXISTS amazon_recap (asin text, title text)""")

class AmazonDemoScraper:
    def __init__(self, base_url, from_page=1) -> None:
        self.base_url = base_url + "&pg="
        self.start_page = from_page
        self.ua = UserAgent()
        self.driver = DBDemoDriver('postgres', 'root')

    def run(self, debug=False, to_page=2):
        if to_page < 1:
            raise Exception("Cannot be less than 1 page.")
        for n in range(self.start_page, to_page + 1):
            # Fetch the listing page with a random User-Agent header.
            page = requests.get(self.base_url + str(n), headers={"User-Agent": self.ua.random})
            soup = BeautifulSoup(page.content, "lxml", from_encoding='UTF-8')
            elements = soup.find_all("span", attrs={"class": "aok-inline-block zg-item"})
            if debug:
                print("[#] Debugging mode")
            for element in elements:
                sub_element = element.find("a", attrs={"class": "a-link-normal"})
                # The fourth '/'-separated segment of the product link is
                # expected to be the ASIN (e.g. /<title>/dp/<ASIN>/...).
                asin = sub_element.get("href")
                asin = asin.split('/')[3]
                title = sub_element.find_all("div")[1]
                if debug:
                    print("ASIN: " + asin + "\nProduct Title: " + title.text)
                # Values are interpolated manually, so single quotes are doubled
                # for SQL; bound parameters (sqlalchemy.text) would avoid this.
                self.driver.db.execute("""INSERT INTO amazon_recap (asin,title) VALUES ('%s', '%s')"""
                                       % (str(asin),
                                          str(title.text)
                                          .lstrip()
                                          .replace('\n', ' ')
                                          .replace('\r', '')
                                          .replace('\t', ' ')
                                          .replace(r"'", r"''")))

if __name__ == "__main__":
    scraper = AmazonDemoScraper("https://www.amazon.com.tr/gp/new-releases/kitchen/ref=zg_bsnr_pg_1?ie=UTF8", from_page=1)
    scraper.run(debug=True, to_page=2)