Last active
May 23, 2021 06:51
-
-
Save FFC12/21067cb2791e04b7e5fe51e491f318d4 to your computer and use it in GitHub Desktop.
Amazon Scraper Demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Author: FFC12 | |
Date: 22.05.2021 | |
Task: | |
*This script scrapes product titles and ASINs from a particular link on Amazon. | |
*Bu script Amazon'dan belirli linklerdeki ürün başlıklarını ve ASIN numaralarını çekip PostgreSQL'e kaydeder. | |
Note: Set 'db_username' and 'db_pass' to your own database credentials. | |
*Requirements: | |
- beautifulsoup | |
- sqlalchemy | |
- fake_useragent | |
- psycopg2 | |
""" | |
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from sqlalchemy import create_engine
class DBDemoDriver:
    """Thin wrapper around a PostgreSQL engine for the scraper demo.

    Connects to the default ``postgres`` database on localhost and ensures
    the ``amazon_recap`` table exists.
    """

    def __init__(self, db_username, db_pass):
        # URL-encode the credentials: a password containing '@', ':' or '/'
        # would otherwise corrupt the connection URL built by concatenation.
        url = "postgresql://%s:%s@localhost/postgres" % (
            quote_plus(db_username),
            quote_plus(db_pass),
        )
        # Select default database
        self.db = create_engine(url)
        # Create table if not exists as amazon_recap
        self.db.execute("""CREATE TABLE IF NOT EXISTS amazon_recap (asin text,title text)""")
class AmazonDemoScraper:
    """Scrapes product titles and ASINs from a paginated Amazon listing.

    Each (asin, title) pair found is inserted into the ``amazon_recap``
    PostgreSQL table through :class:`DBDemoDriver`.
    """

    def __init__(self, base_url, from_page=1) -> None:
        # The listing is paginated with a trailing "&pg=<n>" query parameter.
        self.base_url = base_url + "&pg="
        self.start_page = from_page
        # Random User-Agent per request so requests look less like a bot.
        self.ua = UserAgent()
        # NOTE(review): demo credentials are hard-coded; change
        # 'postgres'/'root' for real use (see module docstring).
        self.driver = DBDemoDriver('postgres', 'root')

    @staticmethod
    def _clean_title(raw):
        # Collapse newlines/tabs and strip leading whitespace so the title
        # is stored as a single line.
        return (raw.lstrip()
                   .replace('\n', ' ')
                   .replace('\r', '')
                   .replace('\t', ' '))

    def run(self, debug=False, to_page=2):
        """Scrape pages [start_page, to_page] and store each (asin, title) row.

        :param debug: when True, prints each scraped ASIN/title to stdout.
        :param to_page: last page number to fetch (inclusive).
        :raises Exception: if ``to_page`` is less than 1 (original contract).
        """
        if to_page < 1:
            raise Exception("Cannot be less than 1 page.")
        for n in range(self.start_page, to_page + 1):
            page = requests.get(self.base_url + str(n),
                                headers={"User-Agent": self.ua.random})
            soup = BeautifulSoup(page.content, "lxml", from_encoding='UTF-8')
            elements = soup.find_all("span", attrs={"class": "aok-inline-block zg-item"})
            if debug:
                print("[#] Debugging mode")
            for element in elements:
                sub_element = element.find("a", attrs={"class": "a-link-normal"})
                if sub_element is None:
                    # Ad placeholder or changed page layout; skip instead of crashing.
                    continue
                parts = sub_element.get("href").split('/')
                if len(parts) < 4:
                    continue  # unexpected URL shape; the ASIN segment is missing
                asin = parts[3]
                divs = sub_element.find_all("div")
                if len(divs) < 2:
                    continue  # title <div> not present for this item
                title = self._clean_title(divs[1].text)
                if debug:
                    print("ASIN: " + asin + "\nProduct Title: " + title)
                # Parameterized INSERT: the DBAPI binds the values, which
                # prevents SQL injection and removes the original manual
                # quote-doubling workaround.
                self.driver.db.execute(
                    """INSERT INTO amazon_recap (asin,title) VALUES (%s, %s)""",
                    (str(asin), title))
if __name__ == "__main__":
    # Demo entry point: scrape the first two pages of the Amazon.com.tr
    # kitchen new-releases listing with debug output enabled.
    demo_url = "https://www.amazon.com.tr/gp/new-releases/kitchen/ref=zg_bsnr_pg_1?ie=UTF8"
    demo = AmazonDemoScraper(demo_url, from_page=1)
    demo.run(debug=True, to_page=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment