Skip to content

Instantly share code, notes, and snippets.

@Xorcerer
Created April 26, 2021 13:46
Show Gist options
  • Save Xorcerer/2f20c0f278e1f397fae8b1816c1ebb68 to your computer and use it in GitHub Desktop.
Save Xorcerer/2f20c0f278e1f397fae8b1816c1ebb68 to your computer and use it in GitHub Desktop.
import asyncio
import os
import re
import socket
import socks
import sys
import urllib.request
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
# SOCKS5 proxy all traffic (browser and image downloads) is routed through.
PROXY_HOST = 'localhost'
PROXY_PORT = 7777

# Root directory comics are saved under; one subdirectory per comic title.
DEST_ROOT = r'D:\Books\Comics'

# Extracts the comic title from the page's <h1> element.
title_re = re.compile('<h1>(.*?)</h1>', re.IGNORECASE)
# Extracts image URLs from the page's `Large_cgurl[N] = "..."` JS assignments.
# Raw string: `\[` and `\d` are invalid escape sequences in a plain str
# (DeprecationWarning; SyntaxWarning from Python 3.12).
image_re = re.compile(r'Large_cgurl\[\d+\] = "(.*?)"')
def crawl_comic(content):
    """Parse a comic page's HTML and download all of its images.

    Creates a directory named after the page's <h1> title under
    DEST_ROOT and saves every URL found in `Large_cgurl[...]`
    assignments into it, keeping each URL's basename as the filename.

    Raises ValueError if the page has no <h1> title.
    """
    # Route all sockets (including urlretrieve below) through the local
    # SOCKS5 proxy by monkey-patching the socket module.
    socks.set_default_proxy(socks.SOCKS5, PROXY_HOST, PROXY_PORT)
    socket.socket = socks.socksocket

    m = title_re.search(content)
    if m is None:
        # Fail loudly instead of an opaque AttributeError on m.group(1).
        raise ValueError('no <h1> title found in page content')
    title = m.group(1)
    print(title)

    dirname = os.path.join(DEST_ROOT, title)
    # makedirs(exist_ok=True) avoids the exists()/mkdir() TOCTOU race and
    # also creates any missing parents of DEST_ROOT (mkdir would fail).
    os.makedirs(dirname, exist_ok=True)

    for img in image_re.finditer(content):
        url = img.group(1)
        print('downloading:', url)
        urllib.request.urlretrieve(url, os.path.join(dirname, url.split('/')[-1]))
async def download(url):
    """Open `url` in headless Chromium behind the SOCKS proxy, fetch the
    rendered HTML (retrying on timeouts), and hand it to crawl_comic().

    Raises RuntimeError if the page cannot be loaded after 20 attempts.
    """
    browser = await launch({
        'args': ['--proxy-server=%s:%d' % (PROXY_HOST, PROXY_PORT)],
        'headless': True,
    })
    try:
        page = await browser.newPage()

        async def get_html():
            # Slow proxied loads time out often; retry a bounded number
            # of times before giving up.
            for _ in range(20):
                try:
                    await page.goto(url, waitUntil='domcontentloaded')
                    return await page.content()
                except TimeoutError:
                    print('timeout while loading:', url)
            return None

        content = await get_html()
        if content is None:
            # All retries timed out; the original fed None to crawl_comic
            # and crashed with an unrelated TypeError there instead.
            raise RuntimeError('failed to load %s after 20 attempts' % url)
        crawl_comic(content)
    finally:
        # Always release the browser process, even if crawling raised
        # (the original leaked it on any exception).
        await browser.close()
def main():
    """Entry point: take the comic URL from argv (or prompt until one is
    entered) and run the async download to completion.
    """
    url = sys.argv[1] if len(sys.argv) > 1 else None
    while not url:
        url = input('url:')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(download(url))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment