Created
October 26, 2014 05:20
-
-
Save pandada8/8d0dab2c336bfa279ea5 to your computer and use it in GitHub Desktop.
it-ebooks.info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import requests | |
from bs4 import BeautifulSoup | |
import sys | |
import time | |
import concurrent.futures | |
import random | |
import json | |
import os | |
import re | |
# Downloads go into the `p` directory under the current working directory by
# default.  A .lst file is also generated in the current directory; it can be
# used as an input file for aria2c.
DOWNLOAD_PATH = 'p'
def find_max():
    """Return the highest book id linked from the it-ebooks.info front page.

    The front page is fetched and scanned for ``book/<digits>/`` links; the
    numerically largest id is returned as an ``int``.

    Raises:
        ValueError: if the page contains no such link (``max`` of an empty
            sequence).
    """
    front_page = requests.get('http://it-ebooks.info/').text
    book_ids = re.findall(r'(?<=book/)\d+(?=/)', front_page)
    # Duplicate ids do not affect max(), so they are not filtered out first.
    return max(map(int, book_ids))
def find_exist():
    """Return the highest book id already present in ``DOWNLOAD_PATH``.

    Downloaded files are named ``<id>-<title>.<format>``, so the id is taken
    from the run of digits that precedes the first ``-``.  Entries that do not
    follow that pattern are ignored instead of raising.  If the directory does
    not exist yet, it is created.

    Returns:
        int: the largest id found, or 0 when the directory was missing, is
        empty, or holds no matching files.
    """
    if not os.path.exists(DOWNLOAD_PATH):
        os.mkdir(DOWNLOAD_PATH)
        return 0

    ids = []
    for name in os.listdir(DOWNLOAD_PATH):
        match = re.match(r'([0-9]+)-', name)
        if match:
            ids.append(int(match.group(1)))
    # max() raises on an empty sequence, so fall back to 0 explicitly.
    return max(ids) if ids else 0
def update():
    """Collect download info for every book newer than the newest local one.

    Ids from the latest one found on the site down to (but not including) the
    highest id already in the download directory are submitted to a pool of
    five worker threads.  Each outcome is printed as it completes, and the
    results gathered so far are always handed to ``dump`` -- even if the run
    is interrupted -- so finished work is not lost.
    """
    latest = find_max()
    existed = find_exist()
    results = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            # Remember which id each future belongs to so a failure can still
            # be reported against its URL.
            future_to_id = {
                executor.submit(extract_info, _id): _id
                for _id in range(latest, existed, -1)
            }
            for future in concurrent.futures.as_completed(future_to_id):
                try:
                    result = future.result()
                except Exception as exc:
                    # One failed fetch must not abort the loop and discard the
                    # remaining results; record it as an error entry instead.
                    result = {
                        'err': repr(exc),
                        'url': 'http://it-ebooks.info/book/{}/'.format(future_to_id[future]),
                    }
                results.append(result)
                print("{}\t{}".format(result['url'], 'Fail' if result['err'] else 'Success'))
    finally:
        dump(results)
def dump(result):
    """Write the successful entries of *result* to an aria2c input file.

    The file is created in the current directory and named after the current
    Unix time (``<seconds>.lst``).  For each entry whose ``'err'`` is ``None``
    the download link is written together with the output name and a Referer
    header option; entries with an error are printed instead.

    Args:
        result: iterable of dicts as produced by ``extract_info``.
    """
    lst_name = "{:.0f}.lst".format(time.time())
    with open(lst_name, 'w') as out:
        lst_path = os.path.abspath(out.name)
        print('Dump links to {}, Use follow command to download \naria2c -i {}'.format(lst_name, lst_path))
        for entry in result:
            if entry['err'] is not None:
                print('{} Err:{}'.format(entry['url'], entry['err']))
                continue
            record = '{}\n out={}\n header=Referer: {}\n'.format(
                entry['dlink'], entry['name'], entry['url'])
            out.write(record)
def extract_info(_id):
    """Fetch one book page and extract its download details.

    Args:
        _id: numeric book id used to build the page URL.

    Returns:
        dict: always contains ``'url'`` and ``'err'``.  On success ``'err'``
        is ``None`` and ``'name'`` (``<id>-<title>.<format>`` with ``/``
        replaced by ``%2f``) and ``'dlink'`` are present.  Otherwise ``'err'``
        is ``'404'`` when the request ended on the site's 404 page, or
        ``'incomplete'`` when an expected element is missing from the page.
    """
    url = 'http://it-ebooks.info/book/{}/'.format(_id)
    req = requests.get(url)
    # Random pause of up to two seconds after each request
    # (presumably to go easy on the server).
    time.sleep(random.random() * 2)
    if req.url == 'http://it-ebooks.info/404/':
        return {'err': '404', 'url': url}

    soup = BeautifulSoup(req.text)
    try:
        fileformat = soup.select('[itemprop=bookFormat]')[0].text.lower()
        # Look the download link up once and reuse it for text and href.
        link = soup.select('table tr:nth-of-type(16) a')[0]
        filename = link.text
        fileurl = link.attrs['href']
    except (AttributeError, IndexError, KeyError):
        # IndexError: a selector matched nothing; KeyError: link without href.
        sys.stderr.write(url + '\n')
        return {'err': 'incomplete', 'url': url}

    return {
        'url': url,
        'name': "{}-{}.{}".format(_id, filename, fileformat).replace('/', '%2f'),
        'dlink': fileurl,
        'err': None,
    }
# Run the update only when this file is executed as a script.
if "__main__" == __name__:
    update()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment