Skip to content

Instantly share code, notes, and snippets.

@vladimirgamalyan
Created July 23, 2015 11:30
Show Gist options
  • Save vladimirgamalyan/dd41bebb633346059d5c to your computer and use it in GitHub Desktop.
Save vladimirgamalyan/dd41bebb633346059d5c to your computer and use it in GitHub Desktop.
Grab fresh photos from xuk.ru
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from ghost import Ghost
import logging
import urllib
import os
def _ensure_dir(path):
    """Create directory ``path`` (with parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def main():
    """Download the photos of every album listed on the xuk.ru front page.

    The front page is loaded through a Ghost session; album and photo pages
    are fetched with ``requests``.  Each album gets its own sub-directory of
    ``output`` named after the last path segment of the album URL, and each
    image is stored there under the last path segment of its own URL.

    Albums or photo pages that lack the expected markup are reported and
    skipped.  Errors raised while downloading an image file propagate
    unchanged.

    Raises:
        RuntimeError: if the front page cannot be loaded with status 200 or
            does not contain the album listing.
        requests.RequestException: if an album or photo page request fails
            or returns an HTTP error status.
    """
    # Imported here (not at module level) so the block runs on both
    # Python 2 and Python 3, where these helpers live in different modules.
    try:
        from urllib.request import urlretrieve
        from urllib.parse import urljoin
    except ImportError:
        from urllib import urlretrieve
        from urlparse import urljoin

    base_url = 'http://xuk.ru'
    base_out_dir = 'output'
    request_timeout = 60  # seconds; without it requests.get may block forever

    def fetch_soup(url):
        """Fetch ``url`` and return it parsed; raise on HTTP error status."""
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html5lib')

    print('Download main page...')
    ghost = Ghost(log_level=logging.ERROR)
    with ghost.start(display=False, download_images=False) as session:
        page, _ = session.open(base_url, timeout=100000)
        # Explicit check instead of ``assert``: asserts are removed under -O.
        if page is None or page.http_status != 200:
            raise RuntimeError('Failed to load ' + base_url)
        # Grab the markup while the session is still open.
        html = str(page.content)

    _ensure_dir(base_out_dir)

    print('Parsing main page...')
    soup = BeautifulSoup(html, 'html5lib')
    albums = soup.find('div', class_='category photoalbums')
    if albums is None:
        raise RuntimeError('Album list not found on ' + base_url)

    for item in albums.find_all('div', class_='item photo-item'):
        href = item.a.get('href') if item.a is not None else None
        if not href:
            continue
        # urljoin leaves absolute links untouched and resolves relative ones.
        album_url = urljoin(base_url, href)
        # rstrip avoids an empty directory name when the link ends with '/'.
        album_name = album_url.rstrip('/').split('/')[-1]
        out_dir = os.path.join(base_out_dir, album_name)
        _ensure_dir(out_dir)
        print('Retrieve model #' + out_dir)

        tiles = fetch_soup(album_url).find('div', class_='photo-items tiles')
        if tiles is None:
            print('No photos found in ' + album_url)
            continue

        for tile in tiles.find_all('div', class_='photo-item'):
            photo_href = tile.a.get('href') if tile.a is not None else None
            if not photo_href:
                continue
            photo_url = urljoin(album_url, photo_href)

            container = fetch_soup(photo_url).find('div', class_='photo-container')
            img = container.img if container is not None else None
            src = img.get('src') if img is not None else None
            if not src:
                print('No image found on ' + photo_url)
                continue

            image_link = urljoin(photo_url, src)
            print(image_link)
            urlretrieve(image_link, os.path.join(out_dir, image_link.split('/')[-1]))
# Run the scraper only when executed as a script, not when imported.
if '__main__' == __name__:
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment