Created
May 10, 2013 02:17
-
-
Save crosslife/5551984 to your computer and use it in GitHub Desktop.
简单的校园网IEEE下载模块,输入论文链接中的ID自动尝试登录下载(只在校园网有效)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
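'''
Simple IEEE Xplore download module for campus networks: given the article ID
from a paper's link, it tries to authenticate and download the PDF
(only works from inside a campus network).

Created on 2013-5-10
@author: crosslife
'''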
from urllib import urlopen | |
import urllib2 | |
import cookielib | |
import re | |
import time | |
import thread | |
import threading | |
# Module-level list, initially empty; none of the functions visible in this
# file read or modify it.
NAMELIST = []
def getAbstract(paper_id):
    """Fetch the title and abstract of an IEEE Xplore article.

    The abstract page for ``paper_id`` is downloaded and scraped with two
    regular expressions (first ``<h1>`` element, first paragraph following
    the ``Abstract`` heading).

    Args:
        paper_id: Article number (``arnumber``); converted with ``str()``
            before being appended to the request URL.

    Returns:
        ``"<title>###<abstract>"`` built from the first match of each
        pattern, or the sentinel ``"empty###empty"`` when the page could not
        be fetched or either pattern matched nothing.
    """
    requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id)
    try:
        paperPage = urlopen(requrl)
        try:
            paperText = paperPage.read()
        finally:
            # Release the connection even if read() fails part-way.
            paperPage.close()
    except Exception:
        # Best effort: any fetch failure is reported via the sentinel below.
        paperText = ""
    titles = re.findall(r'<h1>\s*(.+?)\s*</h1>', paperText, re.IGNORECASE)
    abstracts = re.findall(r'<h2>Abstract</h2>\s*</a>\s*<p>(.+?)</p>', paperText, re.IGNORECASE)
    if not titles or not abstracts:
        # Network failure, or the page has no title/abstract markup.
        return "empty###empty"
    return titles[0] + "###" + abstracts[0]
def authCheck(cookiefile):
    """Check for IEEE Xplore authorization and, on success, save the cookies.

    Requests the conference listing page through a cookie-aware opener and
    looks for a cookie named ``xploreCookies`` in the resulting jar.

    Args:
        cookiefile: Path of the Mozilla-format cookie file written on success.

    Returns:
        True if the ``xploreCookies`` cookie was received (the jar is then
        saved to ``cookiefile``); False if the request failed or the cookie
        is absent.
    """
    cookies = cookielib.MozillaCookieJar(cookiefile)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
    opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"),
        ("Host", "ieeexplore.ieee.org"),
    ]
    try:
        response = opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp")
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        print("network error")
        return False
    # Only the cookies matter; the body is never read, so release the socket.
    response.close()
    for item in cookies:
        if item.name == "xploreCookies":
            print("auth success!")
            cookies.save(cookiefile, ignore_discard=True, ignore_expires=True)
            return True
    return False
def downWithCookies(cookiefile,paper_id):
    """Download an article's PDF using previously saved cookies.

    The abstract is fetched first (with up to five retries) purely to report
    whether it is available; its content does not influence the download.
    The stamp page is then requested with the cookies from ``cookiefile``,
    the first ``<frame src="http...">`` URL is extracted from it, and the
    data behind that URL is written to ``<paper_id>.pdf`` in the current
    working directory.

    Args:
        cookiefile: Path of a Mozilla-format cookie file to load.
        paper_id: Article number; used in the request URLs and the file name.

    Returns:
        True once the file has been written; False when the stamp page
        contains no frame link (reported as "cookies outdated").

    Errors raised while loading the cookie file, performing the HTTP
    requests or writing the file are not caught here.
    """
    abstract = getAbstract(paper_id)
    if abstract == "empty###empty":  # mostly caused by network error
        # %s instead of %d/int(): a non-numeric ID must not crash the message.
        print("None Abstract :%s ,retrying..." % paper_id)
        for _ in range(5):
            abstract = getAbstract(paper_id)
            if abstract != "empty###empty":
                break
        else:
            # Reached only when every retry failed, so a success on the last
            # retry is no longer reported as a failure.
            print("cann't get the abstract of %s" % paper_id)  # mostly caused by no abstract
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookies.load(cookiefile, ignore_discard=True, ignore_expires=True)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
    page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id))
    try:
        pagetext = page.read()
    finally:
        page.close()
    matchGroup = re.findall('<frame src="(http.*?)"', pagetext, re.IGNORECASE)
    if not matchGroup:
        # No frame link in the stamp page: access was not granted.
        print("cookies outdated")
        return False
    pdfpage = opener.open(matchGroup[0])
    try:
        pdftext = pdfpage.read()
    finally:
        pdfpage.close()
    # The with-block closes the file even if the write fails.
    with open(str(paper_id) + ".pdf", "wb") as f:
        f.write(pdftext)
    return True
def timingDown(seconds,paper_id):
    """Wait, authenticate against IEEE Xplore, then download one paper.

    Args:
        seconds: Delay in seconds before the first authentication attempt.
        paper_id: Article number handed to ``downWithCookies``.

    Returns:
        True if ``downWithCookies`` reported success, otherwise False.

    Note:
        Authentication is retried without any attempt limit, pausing five
        seconds between tries, so this call blocks until ``authCheck``
        succeeds.
    """
    time.sleep(seconds)
    print("Starting download...")
    count = 1
    while True:
        print("trying to loging %d" % count)
        # authCheck prints "auth success!" itself, so it is not repeated here.
        if authCheck("cookies.txt"):
            break
        count += 1
        time.sleep(5)
    if downWithCookies("cookies.txt", paper_id):
        print("download success!")
        return True
    return False
def threadDown(seconds,paper_id):
    """Run ``timingDown(seconds, paper_id)`` in a background thread.

    Returns immediately (with None) without waiting for the download.

    Args:
        seconds: Delay in seconds forwarded to ``timingDown``.
        paper_id: Article number forwarded to ``timingDown``.
    """
    worker = threading.Thread(target=timingDown, args=(seconds, paper_id))
    # Daemon thread: it does not keep the interpreter alive once the main
    # thread exits, matching a thread started via thread.start_new_thread.
    worker.daemon = True
    worker.start()
def test():
    """Prompt for a paper ID on stdin and start its download immediately."""
    # Strip stray whitespace so it does not end up in the request URL or the
    # output file name built from the ID.
    paper_id = raw_input("Input the id of the paper:").strip()
    timingDown(0, paper_id)
# Run the interactive download only when executed as a script.
if __name__ == '__main__':
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment