@dot1mav
Last active December 29, 2023 13:23
Collects all the images from Digikala's treasure hunt #digikala #treasure_hunt #digikala_treasure_hunt #treasure-hunt https://www.digikala.com/landings/treasure-hunt/
"""
written by dot1mav
requirements:
autopep8==1.6.0
beautifulsoup4==4.10.0
certifi==2021.10.8
charset-normalizer==2.0.8
colorama==0.4.4
commonmark==0.9.1
idna==3.3
lxml==4.6.4
pycodestyle==2.8.0
Pygments==2.10.0
requests==2.26.0
rich==10.14.0
soupsieve==2.3.1
toml==0.10.2
tqdm==4.62.3
urllib3==1.26.7
"""
from json import loads
from os import mkdir
from os.path import isdir
from shutil import copyfileobj
from threading import Thread
from time import sleep

from bs4 import BeautifulSoup
from requests import get
from tqdm import tqdm


def get_page_number(url):
    # Load the first results page and read the last pagination item to
    # find the total number of pages.
    response = get(url.format(1))
    html = BeautifulSoup(response.text, 'lxml')
    page_number = html.select(
        'div.c-pager>ul>li.js-pagination__item')[-1].find('a').get('data-page')
    return page_number


def scrap_product(url):
    # Fetch a product page and collect the full-size image URLs from its
    # gallery thumbnails, retrying after a short delay on network errors
    # or non-200 responses.
    img_links = []
    try:
        response = get(url)
    except Exception:
        sleep(2)
        return scrap_product(url)
    if response.status_code != 200:
        sleep(2)
        return scrap_product(url)
    html = BeautifulSoup(response.text, 'lxml')
    imgs = html.find_all(
        'div', class_='c-remodal-gallery__thumb js-image-thumb')
    for img in imgs:
        if img:
            img_links.append(img.find('img').get('data-src').split('?')[0])
    return img_links


def scrap_page(n, url):
    # Scrape one results page: download every product's gallery images
    # into scrap-img/<page>/<product-id>/ and write the product details
    # next to them, retrying the page on network errors or non-200
    # responses.
    digi_link = 'https://digikala.com{}'
    if not isdir('scrap-img/{}'.format(n)):
        mkdir('scrap-img/{}'.format(n))
    try:
        response = get(url)
    except Exception:
        sleep(2)
        return scrap_page(n, url)
    if response.status_code != 200:
        sleep(2)
        return scrap_page(n, url)
    html = BeautifulSoup(response.text, 'lxml')
    products = html.find_all('div', class_='c-product-box')
    progress_bar = tqdm(desc='page {}'.format(n), total=len(products),
                        dynamic_ncols=True, position=int(n))
    for product in products:
        details = loads(str(product.get('data-enhanced-ecommerce')))
        if not isdir('scrap-img/{}/{}'.format(n, details['id'])):
            mkdir('scrap-img/{}/{}'.format(n, details['id']))
        link = product.find('a', class_='c-product-box__img').get('href')
        photos = scrap_product(digi_link.format(link))
        for photo in photos:
            # Stream each image straight to disk instead of buffering
            # the whole file in memory.
            res = get(photo, stream=True)
            with open('scrap-img/{}/{}/{}'.format(n, details['id'], photo.split('/')[-1]), 'wb') as f:
                copyfileobj(res.raw, f)
        with open('scrap-img/{}/{}/details.txt'.format(n, details['id']), 'w', encoding="utf-8") as f:
            for key in details:
                f.write('{}:{}\n'.format(key, details[key]))
            f.write('link:{}\n'.format(link))
            for photo in photos:
                f.write('photo_link:{}\n'.format(photo))
        progress_bar.update(1)
        sleep(0.8)


if __name__ == '__main__':
    if not isdir('scrap-img'):
        mkdir('scrap-img')
    url = "https://www.digikala.com/treasure-hunt/products/?pageno={}"
    page_number = get_page_number(url)
    pages = [(i, url.format(i)) for i in range(1, int(page_number) + 1)]
    threads = [Thread(target=scrap_page, args=p, name='page {}'.format(p[0]))
               for p in pages]
    # Start every thread before joining any of them; starting and joining
    # inside the same loop would scrape the pages one at a time.
    for th in threads:
        th.start()
    for th in threads:
        th.join()
@uzer0098
How can I list all the categories, together with their subcategories, sub-subcategories, and so on, using the Digikala API?

For example, the apparel category endpoint: https://api.digikala.com/v1/categories/apparel/

That category itself has four subcategories, and each of those has several subcategories of its own.

Since there are only about 10-12 main categories in total, I'd like to pass in a main category and get back every subcategory, sub-subcategory, etc., along with its link.
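
One way to approach this is to walk the category tree recursively. The sketch below is only a starting point, not a verified client: the response shape it assumes (a top-level "data" object whose "children" list nests further categories, each carrying a "code" field) is a guess about the API, so fetch one real response first and adjust the key names to match.

from requests import get

API = 'https://api.digikala.com/v1/categories/{}/'

def walk_categories(code, depth=0):
    # Fetch one category and recurse into each child, printing an
    # indented tree of category codes and their API links.
    response = get(API.format(code))
    response.raise_for_status()
    data = response.json().get('data', {})    # assumed top-level key
    for child in data.get('children', []):    # assumed field name
        child_code = child.get('code', '')    # assumed field name
        print('{}{} -> {}'.format('  ' * depth, child_code,
                                  API.format(child_code)))
        walk_categories(child_code, depth + 1)

if __name__ == '__main__':
    # Start from one of the ~10-12 main categories, e.g. apparel.
    walk_categories('apparel')

Running it once per main category would cover the whole tree; collecting the results into a list instead of printing them is a one-line change.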
