Last active
August 26, 2022 17:52
-
-
Save greatghoul/5503506 to your computer and use it in GitHub Desktop.
dm5.com 漫画批量下载
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Batch downloader for comics hosted on dm5.com.

requirements:
    PyExecJS
    requests
"""
import sys, os, re, requests, execjs | |
def get_page_list(content):
    """Extract the per-page URLs of a chapter from its HTML.

    Finds the ``<select id="pagelist">`` element in *content* and collects
    the single-quoted ``value`` attribute of every ``<option>`` inside it,
    in document order.

    Args:
        content: HTML source of a chapter page, as text.

    Returns:
        A list with one option value per page; empty when the page
        selector is not present in *content*.
    """
    print(' Fetching page list ...')
    # re.S lets ``.`` cross newlines, so a <select> laid out over several
    # lines is still found; ``[^>]*`` accepts ``id`` as the last attribute.
    pat_select = r'<select[^>]+id="pagelist"[^>]*>(.*?)</select>'
    # ``[^']*`` keeps the value inside its own quotes, and the trailing
    # ``[^>]*`` tolerates extra attributes such as ``selected``.
    pat_option = r"<option[^>]+value='([^']*)'[^>]*>"
    page_list = []
    match = re.search(pat_select, content, flags=re.I | re.S)
    if match:
        page_list = [
            item.group(1)
            for item in re.finditer(pat_option, match.group(1), flags=re.I)
        ]
    print(' %d pages in total.' % len(page_list))
    return page_list
def get_page_title(content):
    """Return the text of the first ``<h1>`` element found in *content*.

    Args:
        content: HTML source of a page, as text.

    Returns:
        The heading text with surrounding whitespace removed, or ``None``
        when there is no ``<h1>`` element or its text is empty.
    """
    # re.S lets the heading text span several lines; the optional
    # attribute group accepts headings such as ``<h1 class="title">``.
    pat_title = r'<h1(?:\s[^>]*)?>(.*?)</h1>'
    match = re.search(pat_title, content, flags=re.I | re.S)
    if match is None:
        return None
    title = match.group(1).strip()
    # An empty heading is reported the same way as a missing one.
    return title or None
# The site returns, via ajax, a script that dynamically assembles the image
# address, e.g.
# http://www.dm5.com/m114927/chapterimagefun.ashx?cid=114927&page=1&language=1&key=
def download(url):
    """Fetch the image-address script for one comic page and print it.

    The chapter id and page number are taken from *url*, which must look
    like ``http://www.dm5.com/m<cid>-p<page>/``. The matching
    ``chapterimagefun.ashx`` endpoint is requested with *url* as the
    Referer. The evaluated script result, the raw script, and the
    ``cid``/``page``/key triple are printed. No image is written to disk
    by this function.

    Args:
        url: Address of a single comic page.

    Returns:
        None. A URL that does not match the expected page form is
        reported on stdout and skipped.
    """
    pat_url = r'http://www\.dm5\.com/m(\d+)-p(\d+)/'
    pat_key = r'\|([a-z0-9A-Z]{32})\|'
    print(' Fetching: %s' % url)
    match = re.search(pat_url, url)
    if match is None:
        # Without this guard a URL lacking "-p<page>" would fail with an
        # AttributeError on ``match.group``.
        print(' Not a valid dm5 page url: %s' % url)
        return
    cid, page = match.groups()
    key_url = (
        'http://www.dm5.com/m%s/chapterimagefun.ashx'
        '?cid=%s&page=%s&language=1&key=' % (cid, cid, page)
    )
    print(key_url)
    content = requests.get(key_url, headers={'Referer': url}).text
    # SECURITY: this evaluates JavaScript received from the remote server;
    # only point this at a site whose responses you are willing to execute.
    print(execjs.eval(content))
    print(content)
    # The key is a 32-character alphanumeric token delimited by pipes.
    match = re.search(pat_key, content)
    key = match.group(1) if match else None
    print('%s %s %s' % (cid, page, key))
def download_all(url):
    """Process every page of the comic chapter at *url*.

    The chapter page is fetched, its ``<h1>`` title is printed, and each
    entry of its page list is handed to ``download`` as an absolute
    ``http://www.dm5.com`` address.

    Args:
        url: Chapter address of the form ``http://www.dm5.com/m<id>/``,
            optionally with a ``-p<page>`` suffix.

    Returns:
        None. An invalid URL or a page without a title is reported on
        stdout and nothing is downloaded.
    """
    pat_url = r'http://www\.dm5\.com/m\d+(-p\d+)?/'
    if not re.match(pat_url, url):
        print('Not a valid dm5 url: %s' % url)
        return

    print('Fetching commics from: %s' % url)
    # Use the decoded body, as download() does, so the text regexes in the
    # helpers can be applied to it.
    content = requests.get(url).text
    page_title = get_page_title(content)
    if not page_title:
        print('Error: comic not found!')
        return

    print(' Title: %s' % page_title)
    # Every page is processed; the loop no longer stops after the first.
    for page in get_page_list(content):
        download('http://www.dm5.com' + page)
if __name__ == '__main__':
    # Directory created next to the current working directory before any
    # download starts.
    OUT_DIR = 'output'
    if not os.path.exists(OUT_DIR):
        os.mkdir(OUT_DIR)

    # NOTE(review): this sample chapter is fetched on every run, before the
    # command-line argument is looked at - confirm whether that is intended.
    download_all('http://www.dm5.com/m114927/')

    cli_args = sys.argv[1:]
    if cli_args:
        download_all(cli_args[0])
    else:
        print('Usage: crawler_dm5_com.py <url>')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment