@dot1mav
Last active December 29, 2023 13:23
Collects all the images from Digikala's treasure hunt #digikala #treasure_hunt #digikala_treasure_hunt #treasure-hunt https://www.digikala.com/landings/treasure-hunt/
"""
written by dot1mav
requirements:
autopep8==1.6.0
beautifulsoup4==4.10.0
certifi==2021.10.8
charset-normalizer==2.0.8
colorama==0.4.4
commonmark==0.9.1
idna==3.3
lxml==4.6.4
pycodestyle==2.8.0
Pygments==2.10.0
requests==2.26.0
rich==10.14.0
soupsieve==2.3.1
toml==0.10.2
tqdm==4.62.3
urllib3==1.26.7
"""
from json import loads
from os import mkdir
from os.path import isdir
from shutil import copyfileobj
from threading import Thread
from time import sleep

from bs4 import BeautifulSoup
from requests import get
from tqdm import tqdm


def get_page_number(url):
    # Load the first results page and read the last pagination item to
    # find the total number of pages.
    response = get(url.format(1))
    html = BeautifulSoup(response.text, 'lxml')
    page_number = html.select(
        'div.c-pager>ul>li.js-pagination__item')[-1].find('a').get('data-page')
    return page_number


def scrap_product(url):
    # Fetch a product page and collect the full-size image URLs from its
    # gallery thumbnails, retrying after a short delay on network errors
    # or non-200 responses.
    img_links = []
    try:
        response = get(url)
    except Exception:
        sleep(2)
        return scrap_product(url)
    if response.status_code != 200:
        sleep(2)
        return scrap_product(url)
    html = BeautifulSoup(response.text, 'lxml')
    imgs = html.find_all(
        'div', class_='c-remodal-gallery__thumb js-image-thumb')
    for img in imgs:
        if img:
            img_links.append(img.find('img').get('data-src').split('?')[0])
    return img_links


def scrap_page(n, url):
    # Scrape one results page: download every product's gallery images
    # into scrap-img/<page>/<product-id>/ and write the product details
    # next to them, retrying the page on network errors or non-200
    # responses.
    digi_link = 'https://digikala.com{}'
    if not isdir('scrap-img/{}'.format(n)):
        mkdir('scrap-img/{}'.format(n))
    try:
        response = get(url)
    except Exception:
        sleep(2)
        return scrap_page(n, url)
    if response.status_code != 200:
        sleep(2)
        return scrap_page(n, url)
    html = BeautifulSoup(response.text, 'lxml')
    products = html.find_all('div', class_='c-product-box')
    progress_bar = tqdm(desc='page {}'.format(n), total=len(products),
                        dynamic_ncols=True, position=int(n))
    for product in products:
        details = loads(str(product.get('data-enhanced-ecommerce')))
        if not isdir('scrap-img/{}/{}'.format(n, details['id'])):
            mkdir('scrap-img/{}/{}'.format(n, details['id']))
        link = product.find('a', class_='c-product-box__img').get('href')
        photos = scrap_product(digi_link.format(link))
        for photo in photos:
            # Stream each image straight to disk instead of buffering
            # the whole file in memory.
            res = get(photo, stream=True)
            with open('scrap-img/{}/{}/{}'.format(n, details['id'], photo.split('/')[-1]), 'wb') as f:
                copyfileobj(res.raw, f)
        with open('scrap-img/{}/{}/details.txt'.format(n, details['id']), 'w', encoding="utf-8") as f:
            for key in details:
                f.write('{}:{}\n'.format(key, details[key]))
            f.write('link:{}\n'.format(link))
            for photo in photos:
                f.write('photo_link:{}\n'.format(photo))
        progress_bar.update(1)
        sleep(0.8)


if __name__ == '__main__':
    if not isdir('scrap-img'):
        mkdir('scrap-img')
    url = "https://www.digikala.com/treasure-hunt/products/?pageno={}"
    page_number = get_page_number(url)
    pages = [(i, url.format(i)) for i in range(1, int(page_number) + 1)]
    threads = [Thread(target=scrap_page, args=p, name='page {}'.format(p[0]))
               for p in pages]
    # Start every thread before joining any of them; starting and joining
    # inside the same loop would scrape the pages one at a time.
    for th in threads:
        th.start()
    for th in threads:
        th.join()
@uzer0098
How can I list all the categories, together with their subcategories, sub-subcategories, and so on, using the Digikala API?

For example, the apparel category endpoint: https://api.digikala.com/v1/categories/apparel/

That category itself has four subcategories, and each of those has several subcategories of its own.

Since there are only about 10-12 main categories in total, I'd like to pass in a main category and get back every subcategory, sub-subcategory, etc., along with its link.
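
One way to approach this is to walk the category tree recursively. The sketch below is only a starting point, not a verified client: the response shape it assumes (a top-level "data" object whose "children" list nests further categories, each carrying a "code" field) is a guess about the API, so fetch one real response first and adjust the key names to match.

from requests import get

API = 'https://api.digikala.com/v1/categories/{}/'

def walk_categories(code, depth=0):
    # Fetch one category and recurse into each child, printing an
    # indented tree of category codes and their API links.
    response = get(API.format(code))
    response.raise_for_status()
    data = response.json().get('data', {})    # assumed top-level key
    for child in data.get('children', []):    # assumed field name
        child_code = child.get('code', '')    # assumed field name
        print('{}{} -> {}'.format('  ' * depth, child_code,
                                  API.format(child_code)))
        walk_categories(child_code, depth + 1)

if __name__ == '__main__':
    # Start from one of the ~10-12 main categories, e.g. apparel.
    walk_categories('apparel')

Running it once per main category would cover the whole tree; collecting the results into a list instead of printing them is a one-line change.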
