Skip to content

Instantly share code, notes, and snippets.

@crosslife
Created December 6, 2012 08:26
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save crosslife/4222730 to your computer and use it in GitHub Desktop.
Save crosslife/4222730 to your computer and use it in GitHub Desktop.
校园网IEEE论文下载系统后台 (backend of a campus-network IEEE paper download system)
'''
Created on 2012-3-7
@author: GeZiyang
'''
from urllib import urlopen
import urllib2
import cookielib
import re
import time
import thread
import threading
NAMELIST=[]  # module-level cache: last getConfName() result, a list of (full_name, punumber) tuples
def getLinks(shortName):
    """Search IEEE Xplore by conference short name.

    Returns the list of conference "punumber" id strings scraped from the
    result page (possibly empty), or False when the request fails.
    """
    searchInterface = "http://ieeexplore.ieee.org/xpl/conferences.jsp?queryText="
    try:
        # BUG FIX: read() can also fail on a dropped connection, so it now
        # lives inside the try; the bare "except:" is narrowed to IOError.
        resultText = urlopen(searchInterface + shortName).read()
    except IOError:
        return False
    return re.findall(r'/xpl/conhome\.jsp\?punumber=(\d+)', resultText, re.IGNORECASE)
def getConfName(shortName):
    """Resolve a conference short name to a list of (full_name, punumber) pairs.

    Side effect: stores the result in the module-level NAMELIST cache.
    Returns False on network failure, [] when nothing matched.
    """
    global NAMELIST
    links = getLinks(shortName)
    if links == False:
        return False
    # BUG FIX: getLinks() returns a (possibly empty) list, never None, so the
    # original "== None" check could not fire; test for emptiness instead.
    if not links:
        print("No matched conference!")
        return []
    namelist = []
    for punumber in links:  # renamed from "id" (shadowed the builtin)
        try:
            confText = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + punumber).read()
        except IOError:
            return False
        matchGroup = re.findall(r'<h1>(.*?)</h1>', confText, re.IGNORECASE)
        # first <h1> on the conference home page is its full name
        namelist.append((matchGroup[0], punumber))
    NAMELIST = namelist
    return namelist
def getHisLinks(result_id):
    """List the punumber ids of every historical edition of a conference.

    Scrapes the conference home page for "mostRecentIssue" links.
    Returns a list of id strings, or False when the request fails.
    """
    try:
        confText = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(result_id)).read()
    except IOError:  # was a bare except; only fetch failures are expected here
        return False
    return re.findall(r'mostRecentIssue\.jsp\?punumber=(\d+)', confText, re.IGNORECASE)
def getPaperLinks(conf_id):
    """Collect the "arnumber" ids of all papers in one conference edition.

    Two requests: first fetch the issue page to scrape the hidden "oqs"
    form value, then request the full table of contents with it.
    Returns a de-duplicated list of id strings, or False on any failure.
    """
    firstreq = "http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=" + str(conf_id)
    try:
        firstText = urlopen(firstreq).read()
    except IOError:
        return False
    oqs = re.findall(r'id="oqs"\s*value="(.*?)"', firstText, re.IGNORECASE)
    if not oqs:
        # BUG FIX: the original indexed oqs[0] unconditionally and raised
        # IndexError when the page layout changed; report failure instead.
        return False
    # suppose the number of papers is less than 1000
    secondreq = ("http://ieeexplore.ieee.org/xpl/tocresult.jsp?" + oqs[0] +
                 "&rowsPerPage=1000&pageNumber=1")
    try:
        paperText = urlopen(secondreq).read()
    except IOError:
        return False
    paperlist = re.findall(r'/xpl/articleDetails\.jsp\?tp=&arnumber=(\d+)', paperText, re.IGNORECASE)
    return list(set(paperlist))  # drop duplicate ids
def getAbstract(paper_id):
    """Fetch "title###abstract" for one paper from its abstract page.

    Returns the sentinel "empty###empty" when the page cannot be fetched or
    parsed; callers treat that sentinel as a retryable failure.
    """
    requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id)
    try:
        paperText = urlopen(requrl).read()
    except Exception:
        # fall through with an empty page: both regexes find nothing and
        # the sentinel is returned below
        paperText = ""
    title = re.findall(r'<h1>\s*(.+?)\s*</h1>', paperText, re.IGNORECASE)
    # renamed from "abs" (shadowed the builtin); dropped the debug print
    abstract = re.findall(r'<h2>Abstract</h2>\s*</a>\s*<p>(.+?)</p>', paperText, re.IGNORECASE)
    if not title or not abstract:  # network exception, or no abstract on the page
        return "empty###empty"
    return title[0] + "###" + abstract[0]
def getNameById(conf_home):
    """Return the conference's full name (first <h1> of its home page), or False."""
    try:
        confText = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(conf_home)).read()
    except IOError:  # was a bare except; only fetch failures are expected here
        return False
    matchGroup = re.findall(r'<h1>(.*?)</h1>', confText, re.IGNORECASE)
    if not matchGroup:
        # BUG FIX: the original raised IndexError on pages with no <h1>;
        # treat that as the same failure the caller already retries on.
        return False
    return matchGroup[0]
def _retryFetch(fetch, retries=5):
    """Call fetch(); on a False result retry up to *retries* more times.

    Returns the last result, which may still be False when every attempt
    failed. Replaces three copy-pasted retry loops in buildNewDb().
    """
    result = fetch()
    attempt = 0
    while result == False and attempt < retries:
        result = fetch()
        attempt += 1
    return result
def buildNewDb(conf_home):
    """Build the on-disk abstract database for one conference.

    Fetches the conference name, every historical edition, every paper id,
    and each paper's "title###abstract" string, then writes:
      confs/<conf_home>  - first line: edition ids; then alternating
                           paper-id / abstract lines
      confs/confindex    - appends <conf_home> and the conference name
    Returns True on success, False when any network step keeps failing.
    """
    conf_name = _retryFetch(lambda: getNameById(conf_home))
    if conf_name == False:
        return False
    conf_all = _retryFetch(lambda: getHisLinks(conf_home))
    if conf_all == False:
        return False
    paper_all = []
    for conf_id in conf_all:
        paper_ids = _retryFetch(lambda: getPaperLinks(conf_id))
        if paper_ids == False:
            return False
        paper_all += paper_ids
    absDb = {}
    for paper_id in paper_all:
        abstract = getAbstract(paper_id)  # renamed from "abs" (shadowed the builtin)
        if abstract == "empty###empty":  # mostly caused by network error
            print("None Abstract :%d ,retrying..." % int(paper_id))
            count = 1
            while abstract == "empty###empty":
                abstract = getAbstract(paper_id)
                count += 1
                if count > 5:
                    # mostly caused by the paper having no abstract at all
                    print("cann't get the abstract of %d" % int(paper_id))
                    break
        absDb[paper_id] = abstract
        print("%d:%s" % (int(paper_id), abstract))  # debug
    print("start writing file...")  # debug
    # BUG FIX: with-statements close the files even when a write fails;
    # the original leaked both handles on any exception.
    with open("confs/" + str(conf_home), 'w') as dbfile:
        for conf_id in conf_all:
            dbfile.write(conf_id + " ")
        dbfile.write("\n")
        for paper_id in paper_all:
            if absDb[paper_id] == "empty###empty":
                continue  # skip papers we never got an abstract for
            dbfile.write(paper_id + "\n")
            dbfile.write(absDb[paper_id] + "\n")
    with open("confs/confindex", 'a') as indexfile:
        indexfile.write(str(conf_home) + '\n')
        indexfile.write(str(conf_name) + '\n')
    print("finished.")
    return True
def getFileDb(conf_home):
    """Load the abstract database written by buildNewDb().

    Returns {paper_id: abstract_line} where the abstract keeps its trailing
    newline (matching the original read loop), or False when the database
    file is missing or malformed.
    """
    absDb = {}
    try:
        # with-statement closes the file even on a parse error; the original
        # leaked the handle when an exception fired mid-read
        with open("confs/" + str(conf_home)) as dbfile:
            dbfile.readline()  # skip the edition-id header line
            while True:
                line = dbfile.readline()
                if not line:
                    break
                # pairs of lines: paper id, then its "title###abstract"
                absDb[line.split()[0]] = dbfile.readline()
        return absDb
    except Exception:  # kept broad: also covers malformed (blank-id) lines
        print("No database file")
        return False
def checkUpdate(conf_home):
    """Return True when the stored edition list still matches IEEE Xplore.

    True  - local database covers the same editions (safe to reuse)
    False - editions differ, network failed, or no local database exists
    """
    conf_all = getHisLinks(conf_home)
    if conf_all == False:
        # BUG FIX: the original fed False into set(), raising a TypeError
        # that the bare except then mislabeled as "no database yet"
        return False
    try:
        with open("confs/" + str(conf_home)) as dbfile:  # closes the handle (original leaked it)
            conflist = dbfile.readline().split()
        return set(conf_all) == set(conflist)
    except IOError:
        print("no database yet")
        return False
def searchPaper(conf_id, keylist):
    """Search the local abstract database of one conference.

    Returns {paper_id: "title###abstract"} for entries containing every
    keyword in *keylist*; an empty keylist returns the whole database.
    Returns False when no database file exists.
    """
    print(keylist)  # debug
    # NOTE(review): looks like a leftover throttle/debug delay - confirm
    # callers do not depend on it before removing
    time.sleep(5)
    absDb = getFileDb(conf_id)
    if absDb == False:
        # BUG FIX: the original crashed iterating False when the database
        # file was missing and keylist was non-empty
        return False
    if not keylist:
        return absDb
    resultdic = {}
    for paper_id, title_abs in absDb.items():
        try:
            # every keyword must occur somewhere in the title###abstract text
            if all(title_abs.find(key) >= 0 for key in keylist):
                resultdic[paper_id] = title_abs
                print(paper_id)
                print(resultdic[paper_id])
        except Exception:  # malformed entry (e.g. non-string value): skip it
            continue
    return resultdic
def authCheck(cookiefile):
    """Probe IEEE Xplore and persist session cookies on success.

    Opens the conference search page with a browser-like User-Agent; if the
    response set the "xploreCookies" cookie, saves the jar to *cookiefile*
    and returns True. Returns False on network error or missing cookie.
    """
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookiehand = urllib2.HTTPCookieProcessor(cookies)
    opener = urllib2.build_opener(cookiehand)
    opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"),
        ("Host", "ieeexplore.ieee.org"),
    ]
    try:
        opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp")
    except Exception:  # was a bare except: any fetch failure means "not authenticated"
        print("network error")
        return False
    for item in cookies:
        if item.name == "xploreCookies":
            print("auth success!")
            cookies.save(cookiefile, ignore_discard=True, ignore_expires=True)
            return True
    return False
def downWithCookies(cookiefile, paper_id):
    """Download one paper's PDF using a previously saved cookie jar.

    Writes papers/<paper_id>.pdf and appends the id and its
    "title###abstract" line to papers/paperindex.
    Returns True on success, False when the cookies are stale.
    """
    abstract = getAbstract(paper_id)  # renamed from "abs" (shadowed the builtin)
    if abstract == "empty###empty":  # mostly caused by network error
        print("None Abstract :%d ,retrying..." % int(paper_id))
        count = 1
        while abstract == "empty###empty":
            abstract = getAbstract(paper_id)
            count += 1
            if count > 5:
                # mostly caused by the paper having no abstract at all
                print("cann't get the abstract of %d" % int(paper_id))
                break
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookies.load(cookiefile, ignore_discard=True, ignore_expires=True)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
    page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id))
    pagetext = page.read()
    # the stamp page embeds the real PDF URL in a <frame>
    matchGroup = re.findall(r'<frame src="(http.*?)"', pagetext, re.IGNORECASE)
    try:
        pdflink = matchGroup[0]
    except IndexError:
        print("cookies outdated")
        return False
    pdftext = opener.open(pdflink).read()
    # BUG FIX: with-statements close the files even when a write fails;
    # the original leaked both handles on exceptions.
    with open("papers/" + str(paper_id) + ".pdf", "wb") as pdffile:
        pdffile.write(pdftext)
    with open("papers/paperindex", "a") as index:
        index.write(str(paper_id) + "\n")
        index.write(str(abstract) + "\n")
    return True
def getConfIndex():
    """Read confs/confindex into {conf_home_id: full_name_line}.

    Lines alternate: id line, then name line (the name keeps its trailing
    newline, matching the original read loop). Returns False when the index
    file does not exist.
    """
    confIndex = {}
    try:
        # with-statement closes the file even on a parse error (original leaked it)
        with open("confs/confindex") as indexfile:
            while True:
                line = indexfile.readline()
                if not line:
                    break
                confIndex[line.split()[0]] = indexfile.readline()
        return confIndex
    except Exception:  # kept broad: also covers malformed (blank-id) lines
        print("No conference index file")
        return False
def getPaperIndex():
    """Read papers/paperindex into {paper_id: title_abs_line}.

    Lines alternate: id line, then "title###abstract" line (kept with its
    trailing newline, matching the original read loop). Returns False when
    the index file does not exist.
    """
    paperIndex = {}
    try:
        # with-statement closes the file even on a parse error (original leaked it)
        with open("papers/paperindex") as indexfile:
            while True:
                line = indexfile.readline()
                if not line:
                    break
                paperIndex[line.split()[0]] = indexfile.readline()
        return paperIndex
    except Exception:  # kept broad: also covers malformed (blank-id) lines
        print("No paper index file")
        return False
def timingDown(seconds, paper_id):
    """Sleep *seconds*, then poll authCheck() until it succeeds and download.

    Retries authentication every 5 seconds indefinitely; once authenticated,
    fetches the paper via downWithCookies(). Returns True on a successful
    download, False otherwise.
    """
    time.sleep(seconds)
    print("Starting download...")
    attempt = 0
    authed = False
    while not authed:
        attempt += 1
        print("trying %d" % attempt)
        authed = (authCheck("cookies.txt") == True)
        if authed:
            print("auth success!")
        else:
            time.sleep(5)  # wait before probing again
    if downWithCookies("cookies.txt", paper_id):
        print("download success!")
        return True
    return False
def threadDown(seconds, paper_id):
    """Run timingDown(seconds, paper_id) in a background thread (non-blocking)."""
    # Modernized from the deprecated thread.start_new_thread; "threading" is
    # already imported at the top of this file. Daemon matches the old
    # behavior: the thread never blocks interpreter exit.
    worker = threading.Thread(target=timingDown, args=(seconds, paper_id))
    worker.setDaemon(True)
    worker.start()
def test():
    """Ad-hoc manual entry point: build the local database for one conference."""
    buildNewDb(1001307)
    # earlier experiments kept for reference:
    #threadDown(100,5279450)
    #downTest()
    #print u"test"
    #print "test".encode('utf-8')
    '''
    paper_id = raw_input("Input the id of the paper:")
    count = 1
    while True:
        print "trying %d" % count
        if authCheck("cookies.txt") == True:
            print "auth success!"
            break
        count += 1
        time.sleep(5)
    if downWithCookies("cookies.txt",paper_id):
        print "download success!"
    buildNewDb(1001545)
    time.sleep(300)
    '''
    pass
if __name__ == '__main__':
    test()
@crosslife
Copy link
Author

test

@xxling
Copy link

xxling commented May 8, 2013

will try it. yeah, diao zha sky.

@saxon17
Copy link

saxon17 commented Feb 26, 2017

非校园网可以下载么

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment