Skip to content

Instantly share code, notes, and snippets.

@fireflyc
Last active November 2, 2016 13:06
Show Gist options
  • Save fireflyc/ba94b3498074e54f7446 to your computer and use it in GitHub Desktop.
Save fireflyc/ba94b3498074e54f7446 to your computer and use it in GitHub Desktop.
《计算机学报》抓取脚本
import urllib, urllib2, os, traceback, time
from bs4 import BeautifulSoup
# Base URL of the journal site; every request path below is appended to it.
root = "http://cjc.ict.ac.cn"
# Local directory that receives the downloaded PDFs; download_pdf creates
# <year>/<month> subfolders beneath it.
book_root = "/Users/fireflyc/Downloads/"
def post_http(url, param=None, charset=None):
    """Request ``url`` and return the response body.

    Args:
        url: Absolute URL to request.
        param: Optional form fields. When not None they are URL-encoded and
            passed to the opener as the request data.
        charset: Optional encoding name. When truthy, the body is decoded
            with it and undecodable bytes are ignored.

    Returns:
        The decoded text when ``charset`` is truthy, otherwise the body
        exactly as read from the response.

    Errors raised by the opener or while reading are not caught here; they
    propagate to the caller.
    """
    data = None
    if param is not None:
        data = urllib.urlencode(param)
    req = urllib2.Request(url)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(req, data)
    try:
        body = response.read()
    finally:
        # Release the connection even if reading the body fails midway.
        response.close()
    if charset:
        return body.decode(charset, errors="ignore")
    return body
def download_pdf(a):
    """Download the PDF behind one article link into ``book_root``.

    The article page referenced by ``a`` is fetched, its metadata cells are
    read to build the target path ``<book_root><year>/<month>/<title>.pdf``,
    and the file linked from the first anchor on that page is saved there.
    A file that already exists is left untouched.

    Args:
        a: Anchor tag whose ``href`` is the article page path relative to
            ``root + "/qwjs/"``.

    Raises:
        IndexError: if the page has fewer metadata cells than expected.
        AttributeError: if the article page contains no anchor.
        Any error raised by ``post_http`` or by the file system calls.
    """
    html = post_http(root + "/qwjs/" + a.attrs["href"], charset="gbk")
    # html is already decoded text, so no from_encoding hint is passed
    # (Beautiful Soup ignores it for Unicode markup and emits a warning).
    soup = BeautifulSoup(html)
    # Kept under its own name so the parameter ``a`` is not shadowed.
    pdf_link = soup.find("a")
    description = soup.find_all("td", {"style": "FONT-SIZE: 8.8pt"})
    # NOTE(review): the cell positions 1/7/9 depend on the site's page
    # layout -- confirm if the layout changes.
    title = description[1].get_text().strip()
    year = description[7].get_text().strip()
    month = description[9].get_text().strip()
    # Keep the text before the first "(". partition() leaves the value
    # intact when there is no parenthesis, whereas slicing with a find()
    # result of -1 would silently drop the last character.
    month = month.partition("(")[0]

    file_name = "%s.pdf" % (title)
    # Strip characters that are unsafe in file names.
    file_name = file_name.replace("?", "").replace(":", "").replace("/", "")
    print("download pdf ")

    book_path = (book_root + "%s/%s" % (year, month)).strip()
    if not os.path.exists(book_path):
        os.makedirs(book_path)

    pdf_path = book_path + "/" + file_name
    if os.path.exists(pdf_path):
        print("exist")
        return

    # Fetch first, then create the file: a failed download must not leave an
    # empty file behind that later runs would report as "exist" and skip.
    content = post_http(root + pdf_link.attrs["href"])
    with open(pdf_path, "wb") as fw:
        fw.write(content)
    print("download ok")
# Fetch the listing page for publication year 2015 and download the article
# linked from each matching table.
html = post_http(
    root + "/qwjs/" + "/list1.asp", {"pub_year": "2015"}, charset="gbk"
)
# html is already decoded text, so no from_encoding hint is passed.
soup = BeautifulSoup(html)
tables = soup.find_all(
    "table", {"width": "90%", "align": "center", "border": "0"}
)
for table in tables:
    link = table.find("a")
    if link is None:
        # No anchor in this table, so there is no article link to follow.
        continue
    try:
        download_pdf(link)
    except Exception:
        # Best effort: report the failure and continue with the next article.
        print(traceback.format_exc())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment