Last active
December 29, 2023 13:23
-
-
Save dot1mav/22106a5e01de412d65963687c81bb6aa to your computer and use it in GitHub Desktop.
جمع آوری کلیه عکس های شکار گنج دیجی کالا #دیجیکالا #شکار_گنج #شکار_گنج_دیجیکالا #treasure-hunt https://www.digikala.com/landings/treasure-hunt/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
write by dot1mav | |
req: | |
autopep8==1.6.0 | |
beautifulsoup4==4.10.0 | |
certifi==2021.10.8 | |
charset-normalizer==2.0.8 | |
colorama==0.4.4 | |
commonmark==0.9.1 | |
idna==3.3 | |
lxml==4.6.4 | |
pycodestyle==2.8.0 | |
Pygments==2.10.0 | |
requests==2.26.0 | |
rich==10.14.0 | |
soupsieve==2.3.1 | |
toml==0.10.2 | |
tqdm==4.62.3 | |
urllib3==1.26.7 | |
""" | |
from os import mkdir | |
from os.path import isdir | |
from bs4 import BeautifulSoup | |
from requests import get | |
from time import sleep | |
from json import loads | |
from shutil import copyfileobj | |
from tqdm import tqdm | |
from threading import Thread | |
def get_page_number(url):
    """Return the last pagination page number for the listing *url*.

    Fetches page 1 of the listing (``url`` is a format string with one
    ``{}`` slot for the page number) and reads the ``data-page``
    attribute of the final pagination item.

    :param url: listing URL template containing a ``{}`` placeholder.
    :return: the page count as a string (attribute value from the HTML).
    """
    soup = BeautifulSoup(get(url.format(1)).text, 'lxml')
    # the last js-pagination__item holds the highest page number
    last_item = soup.select('div.c-pager>ul>li.js-pagination__item')[-1]
    return last_item.find('a').get('data-page')
def scrap_product(url):
    """Collect the full-size gallery image URLs from one product page.

    Retries with a 2-second pause on connection errors or non-200
    responses. The original retried via unbounded recursion inside a bare
    ``except:``, which swallowed KeyboardInterrupt and could exhaust the
    recursion limit during a long outage; a loop with ``except Exception``
    is used instead, and the request carries a timeout so it cannot hang
    forever.

    :param url: absolute URL of the product page.
    :return: list of image URLs with their query strings stripped.
    """
    while True:
        try:
            response = get(url, timeout=30)
        except Exception:  # network hiccup: wait, then try again
            sleep(2)
            continue
        if response.status_code == 200:
            break
        sleep(2)
    html = BeautifulSoup(response.text, 'lxml')
    thumbs = html.find_all(
        'div', class_='c-remodal-gallery__thumb js-image-thumb')
    img_links = []
    for thumb in thumbs:
        if thumb:
            # drop the resize/query parameters to get the original image
            img_links.append(thumb.find('img').get('data-src').split('?')[0])
    return img_links
def scrap_page(n, url):
    """Download every product image on listing page *n* into scrap-img/<n>/.

    For each product box on the page, creates a directory named after the
    product id, saves all gallery photos into it, and writes a
    ``details.txt`` with the product metadata plus the collected links.

    Fixes over the original: the page fetch retried via unbounded
    recursion inside a bare ``except:`` (now a loop with
    ``except Exception`` and a request timeout), and tqdm's ``ncols`` was
    set to the product count — ``ncols`` is the bar width in characters,
    not an item count, and ``dynamic_ncols=True`` overrides it anyway, so
    it is dropped.

    :param n: page number (int or str); used for directory names and to
        position this page's progress bar.
    :param url: full URL of the listing page to scrape.
    """
    digi_link = 'https://digikala.com{}'
    if not isdir('scrap-img/{}'.format(n)):
        mkdir('scrap-img/{}'.format(n))
    # fetch the listing page, retrying on network errors / bad status
    while True:
        try:
            response = get(url, timeout=30)
        except Exception:
            sleep(2)
            continue
        if response.status_code == 200:
            break
        sleep(2)
    html = BeautifulSoup(response.text, 'lxml')
    products = html.find_all('div', class_='c-product-box')
    progress_bar = tqdm(desc='page {}'.format(n), total=len(products),
                        dynamic_ncols=True, position=int(n))
    for product in products:
        # product metadata is embedded as JSON in a data attribute
        details = loads(str(product.get('data-enhanced-ecommerce')))
        product_dir = 'scrap-img/{}/{}'.format(n, details['id'])
        if not isdir(product_dir):
            mkdir(product_dir)
        link = product.find('a', class_='c-product-box__img').get('href')
        photos = scrap_product(digi_link.format(link))
        for photo in photos:
            res = get(photo, stream=True)
            # stream the image body straight to disk without buffering it
            with open('{}/{}'.format(product_dir, photo.split('/')[-1]), 'wb') as f:
                copyfileobj(res.raw, f)
        with open('{}/details.txt'.format(product_dir), 'w', encoding="utf-8") as f:
            for key in details:
                f.write('{}:{}\n'.format(key, details[key]))
            f.write('link:{}\n'.format(link))
            for photo in photos:
                f.write('photo_link:{}\n'.format(photo))
        progress_bar.update(1)
        sleep(0.8)  # be polite to the server between products
if __name__ == '__main__':
    if not isdir('scrap-img'):
        mkdir('scrap-img')
    url = "https://www.digikala.com/treasure-hunt/products/?pageno={}"
    page_number = get_page_number(url)
    pages = [(i, url.format(i)) for i in range(1, int(page_number) + 1)]
    threads = [Thread(target=scrap_page, args=p, name='page {}'.format(p[0]))
               for p in pages]
    # Start every thread before joining any. The original called
    # th.start() immediately followed by th.join() in the same loop,
    # which ran the pages strictly one after another and defeated the
    # point of using threads at all.
    for th in threads:
        th.start()
    for th in threads:
        th.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
چطور میتونم تمام دسته بندی ها به همراه زیردسته ها - زیر زیر دسته ها و … با استفاده از api دیجیکالا لیست کنم؟
مثلا api دسته پوشاک https://api.digikala.com/v1/categories/apparel/
حالا خودش چهارتا زیر دسته داره و هر زیردسته دوباره چندتا زیر دسته
چون کلا 10-12 تا دسته بندی اصلی هست میخوام با وارد کردن دسته اصلی تمام زیر دسته ها و زیرزیردسته ها و … بهمراه لینکشون برام لیست کنه