@tigerwang202
Last active August 29, 2015 14:24
Fetch ebooks from the 61eda website. Requires requests, beautifulsoup4, and wget.
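A usage sketch, assuming the script is saved as fetch61eda.py (the name shown in its own usage message) and run under Python 2 with requests and beautifulsoup4 installed and wget available on the PATH; the script also shells out to the Windows "move" command when archiving:

    pip install requests beautifulsoup4
    python fetch61eda.py http://www.61eda.com/Soft/book/cnbook/

Downloaded .rar files are moved into a folder named after the page indicator read from each listing page.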
# -*- coding: utf-8 -*-
import sys
import requests
import urlparse
import os
import time
from bs4 import BeautifulSoup
agent_string = '''"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; \
Acoo Browser 1.98.744; .NET CLR 3.5.30729)"'''
url_base = "http://www.61eda.com/"
# Download a file with wget.
# Fix: skip files that have already been downloaded (unless the server has a newer copy).
def downloadfile(url):
    os.system("wget --tries=15 "
              "--retry-connrefused "
              "--waitretry=8 "
              "--connect-timeout=5 "
              "--wait=3 "
              "--timeout=120 "
              "-N "
              "--user-agent=%s \"%s\"" % (agent_string, url.encode('gbk')))
    return
def main():
    # Parse command-line input.
    if len(sys.argv) != 2:
        print("fetch ebook from 61eda.com")
        print("usage: fetch61eda.py {page url}")
        print("for example: http://www.61eda.com/Soft/book/cnbook/")
        exit()
    url = sys.argv[1]
    if not url.startswith("http://"):
        url = "http://" + url
    while True:
        # Fetch the listing page.
        try:
            print("fetch from: %s" % url)
            r = requests.get(url)
            # The page is encoded in gb2312; BeautifulSoup only works with Unicode,
            # so decode it first.
            # Fix: skip content that cannot be decoded.
            soup = BeautifulSoup(r.content.decode('gbk', 'ignore'))
            pg_num = soup.find("font", color="FF0000").text
            print("title: %s,\npage: %s" % (soup.title.text, pg_num))
        except Exception as e:
            print(e)
            print(" error, retry after 1 sec...")
            time.sleep(1)
            continue
        # Skip pages whose download directory already exists.
        if not os.path.exists(os.path.join(os.getcwd(), pg_num)):
            # Find links to the ebook download pages.
            # TODO: this handling is a bit simplistic; improve it later.
            for page in soup.find_all(attrs={"align": "left", "width": "284"}):
                url = urlparse.urljoin(url_base, page.a['href'])
                print("download: %s" % url)
                parsepage(url)
            # Archive the current page's downloads into their own directory.
            os.system("mkdir \"%s\"" % pg_num)
            os.system("move *.rar \"%s\"" % pg_num)
        else:
            print("existing folder %s, skip..." % pg_num)
        # Find the link to the next page.
        next_page = soup.find("a", text=u"下一页")
        if next_page is None:
            print("Done!")
            break
        url = urlparse.urljoin(url_base, next_page['href'])
# Parse an ebook's download page and fetch the file it links to.
def parsepage(url):
    r = requests.get(url)
    # Fix: skip content that cannot be decoded.
    s = BeautifulSoup(r.content.decode('gbk', 'ignore'))
    print("download book: %s" % s.title.text)
    link = s.find("a", text=u"下载地址1")
    pdf = urlparse.urljoin(url_base, link['href'])
    print("url: %s" % pdf)
    downloadfile(pdf)
if __name__ == '__main__':
    main()