Skip to content

Instantly share code, notes, and snippets.

@qzane
Created September 20, 2017 16:54
Show Gist options
  • Save qzane/a4d3047107ec3352fb258b71b4f2e492 to your computer and use it in GitHub Desktop.
Save qzane/a4d3047107ec3352fb258b71b4f2e492 to your computer and use it in GitHub Desktop.
kukudm.com downloader
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
kukudm.com comic downloader.

Created on Wed Sep 20 00:40:18 2017
@author: qzane
"""
import os
import re
import sys
from multiprocessing import Pool

import requests
from lxml import etree

TIMEOUT = 10.0  # per-request timeout in seconds
RETRY = 5       # attempts per HTTP request before giving up
WORKERS = 3     # size of the (optional) download process pool

# NOTE: the original code did ``WORKER = Pool(WORKERS)`` right here, at
# import time and *before* the worker functions below were defined.
# Processes forked at that moment cannot see ``download_chapter``, which
# is exactly the ``AttributeError: Can't get attribute 'download_chapter'``
# reported when using ``WORKER.imap_unordered`` (and on Windows, an
# import-time Pool re-spawns itself recursively).  Create the pool lazily,
# inside ``main()``/the ``__main__`` guard, after all definitions exist.
WORKER = None
def http_get(url):
    """GET *url* with ``requests``, retrying on failure.

    Tries up to ``RETRY`` times with ``TIMEOUT`` seconds per attempt and
    returns the ``requests.Response`` of the first success.  If every
    attempt fails, the last network error is re-raised.  (The original
    version left ``page`` potentially unbound; returning from inside the
    loop avoids that.)
    """
    last_err = None
    for _ in range(RETRY):
        try:
            return requests.get(url, timeout=TIMEOUT)
        except Exception as err:  # requests raises many exception subclasses
            last_err = err
    # All attempts failed: surface the final error to the caller.
    raise last_err
def get_commic(url='http://comic.kukudm.com/comiclist/2044/index.htm'):
    """Fetch a comic's index page and list its chapters.

    Parameters
    ----------
    url : str
        A kukudm comic index page URL (must contain ``comiclist`` and
        ``index.htm``).

    Returns
    -------
    dict
        ``{'name': <comic title or ''>, 'chapters': [(name, url), ...]}``

    Raises
    ------
    ValueError
        If *url* does not look like a kukudm index page.  (The original
        used ``assert``, which is stripped under ``python -O``.)
    """
    if ('comic.kukudm.com' not in url or 'comiclist' not in url
            or 'index.htm' not in url):
        raise ValueError('expected a kukudm comic index URL, got %r' % url)
    result = {'name': '', 'chapters': []}
    # The site serves GBK-encoded pages.
    html = http_get(url).content.decode('gbk')
    tree = etree.HTML(html)
    try:
        name = tree.xpath('//title/text()')[0]
        # Title looks like "<comic name>|<site name>" -- keep the name part.
        if '|' in name:
            name = name.split('|')[0]
        result['name'] = name
    except IndexError:  # page has no <title>
        print('name not found!')
    try:
        for dd in tree.xpath('//dl[@id="comiclistn"]/dd'):
            chapter_name = dd.xpath('./a/text()')[0]
            chapter_url = ('http://comic.kukudm.com'
                           + dd.xpath('./a/@href')[0])
            result['chapters'].append((chapter_name, chapter_url))
    except IndexError:  # a <dd> without the expected <a> child
        print('chapters err')
    return result
def get_chapter(url='http://comic.kukudm.com/comiclist/2044/43474/1.htm'):
    """Walk a chapter's pages and collect the image URLs in page order.

    Starting from the chapter's first page, scrape the ``newkuku...jpg``
    image reference, then follow the "next page" arrow (the ``d.gif``
    link) until it points at an "exit" URL.

    Raises
    ------
    ValueError
        If *url* does not look like a kukudm chapter page.  (The original
        used ``assert``, which is stripped under ``python -O``.)
    """
    if ('comic.kukudm.com' not in url or 'comiclist' not in url
            or 'index.htm' in url):
        raise ValueError('expected a kukudm chapter page URL, got %r' % url)
    pages = []
    while True:
        html = http_get(url).content.decode('gbk')  # site serves GBK
        matches = re.findall(r'newkuku.+?\.jpg', html)
        if not matches:
            # Malformed/unexpected page: stop instead of raising IndexError.
            break
        pages.append('http://n.1whour.com/' + matches[0])
        print('get page:' + str(len(pages)))
        tree = etree.HTML(html)
        next_links = tree.xpath(r'//img[@src="/images/d.gif"]/parent::a/@href')
        if not next_links:
            break
        url = 'http://comic.kukudm.com' + next_links[0]
        if 'exit' in url:  # the last page's "next" arrow links to an exit URL
            break
    return pages
def download_chapter(paras):
    """Download one chapter.

    *paras* is a ``(chapter_url, dest_dir)`` pair (a single tuple so the
    function can be used with ``map``-style APIs).  Each page image is
    saved as ``<page_index>.jpg`` inside *dest_dir*.
    """
    chapter_url, dest_dir = paras
    for page_no, img_url in enumerate(get_chapter(chapter_url)):
        response = http_get(img_url)
        target = os.path.join(dest_dir, '%d.jpg' % page_no)
        with open(target, 'wb') as out:
            out.write(response.content)
        print(target)
def main():
    """CLI entry point: ``kukudm.py [index_url] [output_dir]``.

    Downloads every chapter of the comic at *index_url* (default: a sample
    comic) into ``<output_dir>/<comic name>/<chapter name>/``.
    """
    url = (sys.argv[1] if len(sys.argv) >= 2
           else 'http://comic.kukudm.com/comiclist/2044/index.htm')
    path = sys.argv[2] if len(sys.argv) >= 3 else '.'
    commic = get_commic(url)
    print('name:', commic['name'])
    path = os.path.join(path, commic['name'])
    # makedirs creates missing parents too, and exist_ok avoids the racy
    # isdir-then-mkdir pattern of the original (os.mkdir would fail when
    # the output directory's parent did not exist).
    os.makedirs(path, exist_ok=True)
    jobs = []
    for chapter_name, chapter_url in commic['chapters']:
        c_path = os.path.join(path, chapter_name)
        os.makedirs(c_path, exist_ok=True)
        jobs.append((chapter_url, c_path))
    # Sequential download.  If a process pool is wanted, it must be created
    # *here* (after all worker functions are defined), never at import time.
    for job in jobs:
        download_chapter(job)


if __name__ == '__main__':
    main()
@qzane
Copy link
Author

qzane commented Sep 20, 2017

When using WORKER.imap_unordered, the first three jobs fail with an error saying:
AttributeError: Can't get attribute 'download_chapter' on <module 'main' from 'kukudm.py'>

Can someone tell me why?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment