Skip to content

Instantly share code, notes, and snippets.

@pandada8
Created October 26, 2014 05:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pandada8/8d0dab2c336bfa279ea5 to your computer and use it in GitHub Desktop.
Save pandada8/8d0dab2c336bfa279ea5 to your computer and use it in GitHub Desktop.
it-ebooks.info
import sys
import requests
from bs4 import BeautifulSoup
import sys
import time
import concurrent.futures
import random
import json
import os
import re
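'''
By default, downloaded books are expected in the `p` directory under the current directory.
A .lst file is generated in the current directory; it can be used as an input file for aria2c.
'''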
# Directory (relative to the current working directory) that find_exist()
# scans for already-downloaded books; it is created there if missing.
DOWNLOAD_PATH = 'p'
def find_max():
    """Return the largest book id linked from the it-ebooks.info home page.

    The home page is fetched and every run of digits that sits between
    ``book/`` and the following ``/`` is collected; the biggest one is
    returned as an int.
    """
    homepage = requests.get('http://it-ebooks.info/').text
    book_ids = set(re.findall(r'(?<=book/)\d+(?=/)', homepage))
    return max(map(int, book_ids))
def find_exist(path=None):
    """Return the highest book id already present in the download directory.

    File names are expected to look like ``<id>-<title>.<ext>``; the id is
    the integer before the first ``-``. Names that do not start with a digit,
    or whose prefix is not a valid integer, are ignored.

    Args:
        path: Directory to scan. Defaults to the module-level DOWNLOAD_PATH.

    Returns:
        The largest id found, or 0 when the directory is empty, contains no
        matching files, or did not exist (in which case it is created).
    """
    if path is None:
        path = DOWNLOAD_PATH
    if not os.path.exists(path):
        os.mkdir(path)
        return 0

    ids = []
    for name in os.listdir(path):
        if not name[:1].isdigit():
            continue
        try:
            ids.append(int(name.split('-')[0]))
        except ValueError:
            # Starts with a digit but the prefix is not a plain integer
            # (e.g. "1abc-x.pdf"); not one of our downloads, so skip it.
            continue
    # default=0 keeps an existing-but-empty directory from raising ValueError.
    return max(ids, default=0)
def update():
    """Collect download info for every book newer than the local copies.

    Ids from find_max() down to (but excluding) find_exist() are handed to
    extract_info() on a pool of 5 worker threads. Each result is printed as
    it completes, and all collected results are passed to dump() even if the
    run is interrupted.

    A worker that raises no longer aborts the whole run: the exception is
    recorded as that book's ``err`` so the remaining results are still
    collected and dumped.
    """
    latest = find_max()
    existed = find_exist()
    results = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            # Map each future back to its id so a failure can be attributed.
            future_to_id = {
                executor.submit(extract_info, _id): _id
                for _id in range(latest, existed, -1)
            }
            for future in concurrent.futures.as_completed(future_to_id):
                try:
                    result = future.result()
                except Exception as exc:
                    # Same URL pattern extract_info() requests for this id.
                    result = {
                        'err': repr(exc),
                        'url': 'http://it-ebooks.info/book/{}/'.format(
                            future_to_id[future]),
                    }
                results.append(result)
                print("{}\t{}".format(result['url'], 'Fail' if result['err'] else 'Success'))
    finally:
        # Always write out whatever was gathered, including partial runs.
        dump(results)
def dump(result):
    """Write the successful results to a timestamped ``.lst`` file.

    The file is created in the current directory and named after the current
    Unix time. For every entry whose ``err`` is None, three lines are
    written: the download link, an ``out=`` line with the target file name,
    and a ``header=Referer:`` line with the book page URL. Entries with an
    error are not written; their URL and error are printed instead.

    Args:
        result: Iterable of dicts. Successful entries carry ``dlink``,
            ``name``, ``url`` and ``err`` (None); failed entries carry
            ``url`` and a non-None ``err``.
    """
    filename = "{:.0f}.lst".format(time.time())
    with open(filename, 'w') as fp:
        print('Dump links to {}, Use follow command to download \naria2c -i {}'.format(filename, os.path.abspath(fp.name)))
        for entry in result:
            if entry['err'] is None:
                fp.write('{}\n out={}\n header=Referer: {}\n'.format(entry['dlink'], entry['name'], entry['url']))
            else:
                print('{} Err:{}'.format(entry['url'], entry['err']))
def extract_info(_id):
    """Fetch one book page and extract its download information.

    Args:
        _id: Numeric book id used to build the page URL.

    Returns:
        A dict that always has ``url`` and ``err``. On success ``err`` is
        None and the dict also has ``name`` (``<id>-<title>.<format>`` with
        ``/`` replaced by ``%2f``) and ``dlink`` (the download link's href).
        ``err`` is ``'404'`` when the request ended up on the site's 404
        page, and ``'incomplete'`` when the expected elements are missing
        from the page.
    """
    url = 'http://it-ebooks.info/book/{}/'.format(_id)
    req = requests.get(url)
    # Random pause of up to 2 seconds after each request.
    time.sleep(random.random() * 2)
    if req.url == 'http://it-ebooks.info/404/':
        return {'err': '404', 'url': url}

    soup = BeautifulSoup(req.text)
    try:
        fileformat = soup.select('[itemprop=bookFormat]')[0].text.lower()
        link = soup.select('table tr:nth-of-type(16) a')[0]
        filename = link.text
        fileurl = link.attrs.get('href')
    except (AttributeError, IndexError):
        # select() returns an empty list when nothing matches, so a missing
        # element surfaces as IndexError rather than AttributeError.
        fileurl = None

    if fileurl is None:
        # Without a link there is nothing to download; report the page
        # instead of emitting a literal "None" as the download link.
        sys.stderr.write(url + '\n')
        return {'err': 'incomplete', 'url': url}

    return {
        'url': url,
        'name': "{}-{}.{}".format(_id, filename, fileformat).replace('/', '%2f'),
        'dlink': fileurl,
        'err': None,
    }
# Run the updater only when this file is executed as a script.
if __name__ == '__main__':
    update()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment