Created
June 21, 2012 14:11
-
-
Save AyumuKasuga/2965964 to your computer and use it in GitHub Desktop.
save images from 2ch.so
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
#coding: utf-8 | |
import gevent.monkey | |
gevent.monkey.patch_socket() | |
import sys | |
import os | |
from urlparse import urlparse | |
import urllib | |
from lxml.html import document_fromstring | |
import gevent | |
from gevent.queue import Queue | |
def parser_2ch(src):
    """Extract the thread title and full-size image links from a 2ch.so page.

    src -- raw HTML of the thread page.

    Returns a dict with:
      'images'     -- list of absolute image URLs, in document order;
      'threadname' -- text of the first <span class="filetitle">, or None
                      when the page has no such element;
      'threaname'  -- same value under the historical (misspelled) key,
                      kept so existing callers keep working.
    """
    doc = document_fromstring(src)
    # Select the href of every anchor that wraps a thumbnail directly;
    # anchors without an href simply produce no result instead of raising.
    hrefs = doc.xpath('//a[img[@class="img"]]/@href')
    listimages = ['http://2ch.so' + href for href in hrefs]
    titles = doc.xpath('//span[@class="filetitle"]')
    # A page without a title span must not crash the whole run.
    threadname = titles[0].text if titles else None
    return {
        'threaname': threadname,
        'threadname': threadname,
        'images': listimages,
    }
def download_and_save(folder):
    """Worker: drain the shared queue, saving each image into *folder*.

    Reads the module-level queue ``q`` (image URLs) and ``allimglen``
    (total number of images, used only for the progress line). Files that
    already exist in *folder* are skipped, so an interrupted run can be
    resumed. Progress is rewritten in place on stdout after every item.
    """
    while not q.empty():
        link = q.get()
        # The local file name is the last path component of the URL.
        path = '%s/%s' % (folder, link.split('/')[-1])
        if not os.path.exists(path):
            response = urllib.urlopen(link)
            try:
                src = response.read()
            finally:
                # Release the connection even if the read fails.
                response.close()
            # Images are binary data: text mode would mangle the bytes on
            # platforms that translate line endings.
            with open(path, 'wb') as f:
                f.write(src)
        status_str = 'download: %s/%s' % (allimglen - q.qsize(), allimglen)
        # '\r' returns to the start of the line so the counter updates in place.
        sys.stdout.write('\r')
        sys.stdout.write(status_str)
        sys.stdout.flush()
def folder_name(url):
    """Build a default folder name of the form ``host_board_thread``.

    url -- thread URL such as ``http://2ch.so/b/res/12345.html``; a
           trailing ``#fragment`` or ``?query`` is ignored so that links
           to a specific post map to the same folder as the thread itself.

    Raises IndexError when the URL has fewer than four '/'-separated parts.
    """
    # Drop the fragment and query before splitting the path.
    clean = url.split('#', 1)[0].split('?', 1)[0]
    s = clean.split('/')
    page = s[-1]
    # Strip only a trailing '.html', not every occurrence inside the name.
    if page.endswith('.html'):
        page = page[:-len('.html')]
    return "%s_%s_%s" % (s[-4], s[-3], page)
# ---------------------------------------------------------------------------
# Script entry: saveimg.py url [folder]
# Names below stay at module level because download_and_save() reads the
# globals ``q`` and ``allimglen``.
# ---------------------------------------------------------------------------
if len(sys.argv) < 2:
    print('use: saveimg.py url [folder]')
    # Non-zero status so shells and scripts can detect the usage error.
    sys.exit(1)
srcurl = sys.argv[1]
# Optional second argument overrides the auto-generated folder name.
userfolder = sys.argv[2] if len(sys.argv) > 2 else False

# Maps a supported host name to the function that parses its thread pages.
chans = {
    '2ch.so': parser_2ch
}
# Number of concurrent download greenlets.
threads = 5

host = urlparse(srcurl).netloc
if host not in chans:
    # Fail loudly instead of printing "done!" without downloading anything.
    print('unsupported host: %s' % host)
    sys.exit(1)

folder = userfolder or folder_name(srcurl)
# Reuse an existing folder; let real failures (permissions, bad path) surface.
if not os.path.isdir(folder):
    os.mkdir(folder)

src = urllib.urlopen(srcurl).read()
img = chans[host](src)
allimglen = len(img['images'])
q = Queue()
for i in img['images']:
    q.put_nowait(i)
# Spawn the workers and block until the queue has been drained by all of them.
gevent.joinall([gevent.spawn(download_and_save, folder) for _ in range(threads)])
print('\ndone!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment