crosslife/gist:5551993

## gistfile1.py
'''
Created on 2013-5-10

@author: crosslife
'''
from urllib import urlopen
import urllib2
import cookielib
import re
import time
import thread
import threading

# Module-level list, initially empty.
# NOTE(review): nothing in the visible code reads or appends to it -- confirm
# whether another module relies on it before removing.
NAMELIST=[]
def getAbstract(paper_id):
    """Fetch the title and abstract of an IEEE Xplore paper.

    Requests the ``abs_all.jsp`` page for ``paper_id`` and scrapes the first
    ``<h1>`` element (title) and the ``<p>`` that follows the
    ``<h2>Abstract</h2>`` heading.

    Args:
        paper_id: article number; converted with ``str()`` and appended to
            the ``arnumber=`` query parameter.

    Returns:
        ``"<title>###<abstract>"`` on success, or the sentinel string
        ``"empty###empty"`` when the page could not be fetched or either
        field was not found in it.
    """
    requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id)
    paperText = ""
    try:
        paperPage = urlopen(requrl)
        try:
            paperText = paperPage.read()
        finally:
            # Always release the connection, even if read() fails.
            paperPage.close()
    except Exception:
        # Any fetch failure is reported to the caller through the sentinel.
        paperText = ""

    titles = re.findall(r'<h1>\s*(.+?)\s*</h1>', paperText, re.IGNORECASE)
    abstracts = re.findall(r'<h2>Abstract</h2>\s*</a>\s*<p>(.+?)</p>', paperText, re.IGNORECASE)
    print(abstracts) #debug
    if not titles or not abstracts:#network exception
        return "empty###empty"
    return titles[0] + "###" + abstracts[0]

# Check for authentication; on success, save the cookies to cookiefile.
def authCheck(cookiefile):
    """Request the conferences page and look for the ``xploreCookies`` cookie.

    Args:
        cookiefile: path used for the ``MozillaCookieJar``; the jar is saved
            there only when the cookie is found.

    Returns:
        True if a cookie named ``xploreCookies`` was received (the jar is
        then written to ``cookiefile``); False if the request failed or the
        cookie is absent.
    """
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookiehand = urllib2.HTTPCookieProcessor(cookies)
    opener = urllib2.build_opener(cookiehand)
    opener.addheaders = [("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"),
                         ("Host","ieeexplore.ieee.org")]
    try:
        response = opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp")
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
        # so a caller that retries in a loop can still be interrupted.
        print("network error")
        return False
    # Only the cookies are needed; the body is never read.
    response.close()
    for item in cookies:
        if item.name == "xploreCookies":
            print("auth success!")
            cookies.save(cookiefile, ignore_discard=True, ignore_expires=True)
            return True
    return False

def downWithCookies(cookiefile,paper_id):
    """Download a paper's PDF using cookies previously saved to ``cookiefile``.

    First tries to fetch the abstract via ``getAbstract`` (with up to five
    retries); the abstract text itself is not used further here, only the
    progress messages are printed. Then loads the cookie jar, opens the
    ``stamp.jsp`` page, extracts the first ``<frame src="http...">`` link and
    writes the content behind it to ``<paper_id>.pdf`` in the current
    directory.

    Args:
        cookiefile: path of a Mozilla-format cookie file to load.
        paper_id: article number (anything ``str()`` accepts; must also be
            ``int()``-convertible if the abstract retry messages are printed).

    Returns:
        True once the file has been written; False if no frame link was
        found in the stamp page (reported as "cookies outdated").

    Raises:
        Errors from loading the cookie file or from the HTTP requests are
        not caught here and propagate to the caller.
    """
    sentinel = "empty###empty"
    abstract = getAbstract(paper_id)
    if abstract == sentinel:#mostly caused by network error
        print("None Abstract :%d ,retrying..." % int(paper_id))
        for _ in range(5):
            abstract = getAbstract(paper_id)
            if abstract != sentinel:
                break
        else:
            # Reached only when every retry failed; a success on the last
            # retry no longer triggers this message.
            print("cann't get the abstract of %d" % int(paper_id)) #mostly caused by no abstract
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookies.load(cookiefile, ignore_discard=True, ignore_expires=True)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
    page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id))
    try:
        pagetext = page.read()
    finally:
        page.close()
    matchGroup = re.findall('<frame src="(http.*?)"',pagetext,re.IGNORECASE)
    if not matchGroup:
        print("cookies outdated")
        return False
    pdfpage = opener.open(matchGroup[0])
    try:
        pdftext = pdfpage.read()
    finally:
        pdfpage.close()
    # ``with`` closes the file even if the write fails.
    with open(str(paper_id)+".pdf","wb") as f:
        f.write(pdftext)
    return True

def timingDown(seconds,paper_id):
    """Wait ``seconds``, obtain cookies, then download the paper.

    Calls ``authCheck("cookies.txt")`` repeatedly, pausing five seconds
    between attempts, until it returns True (there is no attempt limit),
    and then hands over to ``downWithCookies``.

    Returns:
        True if ``downWithCookies`` reported success, otherwise False.
    """
    time.sleep(seconds)
    print("Starting download...")
    attempt = 1
    print("trying to login %d" % attempt)
    while not authCheck("cookies.txt"):
        attempt += 1
        time.sleep(5)
        print("trying to login %d" % attempt)
    print("auth success!")
    if not downWithCookies("cookies.txt",paper_id):
        return False
    print("download success!")
    return True

def threadDown(seconds,paper_id):
    """Start ``timingDown(seconds, paper_id)`` on a new thread.

    Uses ``thread.start_new_thread``; nothing is returned to the caller.
    """
    worker_args = (seconds, paper_id)
    thread.start_new_thread(timingDown, worker_args)

def test():
    """Prompt for a paper id on stdin and download it without delay."""
    timingDown(0, raw_input("Input the id of the paper:"))


# Run the interactive download only when executed as a script.
if __name__ == '__main__':
    test()
	'''
	Created on 2013-5-10

	@author: crosslife
	'''
	from urllib import urlopen
	import urllib2
	import cookielib
	import re
	import time
	import thread
	import threading

	# Module-level list, initially empty.
	# NOTE(review): nothing in the visible code reads or appends to it -- confirm
	# whether another module relies on it before removing.
	NAMELIST=[]
	def getAbstract(paper_id):
	    """Fetch the title and abstract of an IEEE Xplore paper.

	    Requests the ``abs_all.jsp`` page for ``paper_id`` and scrapes the first
	    ``<h1>`` element (title) and the ``<p>`` that follows the
	    ``<h2>Abstract</h2>`` heading.

	    Args:
	        paper_id: article number; converted with ``str()`` and appended to
	            the ``arnumber=`` query parameter.

	    Returns:
	        ``"<title>###<abstract>"`` on success, or the sentinel string
	        ``"empty###empty"`` when the page could not be fetched or either
	        field was not found in it.
	    """
	    requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id)
	    paperText = ""
	    try:
	        paperPage = urlopen(requrl)
	        try:
	            paperText = paperPage.read()
	        finally:
	            # Always release the connection, even if read() fails.
	            paperPage.close()
	    except Exception:
	        # Any fetch failure is reported to the caller through the sentinel.
	        paperText = ""

	    # ``\s*`` (zero or more whitespace) so tags with no surrounding
	    # whitespace, or with several whitespace characters, still match.
	    titles = re.findall(r'<h1>\s*(.+?)\s*</h1>', paperText, re.IGNORECASE)
	    abstracts = re.findall(r'<h2>Abstract</h2>\s*</a>\s*<p>(.+?)</p>', paperText, re.IGNORECASE)
	    print(abstracts) #debug
	    if not titles or not abstracts:#network exception
	        return "empty###empty"
	    return titles[0] + "###" + abstracts[0]

	# Check for authentication; on success, save the cookies to cookiefile.
	def authCheck(cookiefile):
	    """Request the conferences page and look for the ``xploreCookies`` cookie.

	    Args:
	        cookiefile: path used for the ``MozillaCookieJar``; the jar is saved
	            there only when the cookie is found.

	    Returns:
	        True if a cookie named ``xploreCookies`` was received (the jar is
	        then written to ``cookiefile``); False if the request failed or the
	        cookie is absent.
	    """
	    cookies = cookielib.MozillaCookieJar(cookiefile)
	    cookiehand = urllib2.HTTPCookieProcessor(cookies)
	    opener = urllib2.build_opener(cookiehand)
	    opener.addheaders = [("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"),
	                         ("Host","ieeexplore.ieee.org")]
	    try:
	        response = opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp")
	    except Exception:
	        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
	        # so a caller that retries in a loop can still be interrupted.
	        print("network error")
	        return False
	    # Only the cookies are needed; the body is never read.
	    response.close()
	    for item in cookies:
	        if item.name == "xploreCookies":
	            print("auth success!")
	            cookies.save(cookiefile, ignore_discard=True, ignore_expires=True)
	            return True
	    return False

	def downWithCookies(cookiefile,paper_id):
	    """Download a paper's PDF using cookies previously saved to ``cookiefile``.

	    First tries to fetch the abstract via ``getAbstract`` (with up to five
	    retries); the abstract text itself is not used further here, only the
	    progress messages are printed. Then loads the cookie jar, opens the
	    ``stamp.jsp`` page, extracts the first ``<frame src="http...">`` link and
	    writes the content behind it to ``<paper_id>.pdf`` in the current
	    directory.

	    Args:
	        cookiefile: path of a Mozilla-format cookie file to load.
	        paper_id: article number (anything ``str()`` accepts; must also be
	            ``int()``-convertible if the abstract retry messages are printed).

	    Returns:
	        True once the file has been written; False if no frame link was
	        found in the stamp page (reported as "cookies outdated").

	    Raises:
	        Errors from loading the cookie file or from the HTTP requests are
	        not caught here and propagate to the caller.
	    """
	    sentinel = "empty###empty"
	    abstract = getAbstract(paper_id)
	    if abstract == sentinel:#mostly caused by network error
	        print("None Abstract :%d ,retrying..." % int(paper_id))
	        for _ in range(5):
	            abstract = getAbstract(paper_id)
	            if abstract != sentinel:
	                break
	        else:
	            # Reached only when every retry failed; a success on the last
	            # retry does not trigger this message.
	            print("cann't get the abstract of %d" % int(paper_id)) #mostly caused by no abstract
	    cookies = cookielib.MozillaCookieJar(cookiefile)
	    cookies.load(cookiefile, ignore_discard=True, ignore_expires=True)
	    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
	    page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id))
	    try:
	        pagetext = page.read()
	    finally:
	        page.close()
	    matchGroup = re.findall('<frame src="(http.*?)"',pagetext,re.IGNORECASE)
	    if not matchGroup:
	        print("cookies outdated")
	        return False
	    pdfpage = opener.open(matchGroup[0])
	    try:
	        pdftext = pdfpage.read()
	    finally:
	        pdfpage.close()
	    # ``with`` closes the file even if the write fails.
	    with open(str(paper_id)+".pdf","wb") as f:
	        f.write(pdftext)
	    return True

	def timingDown(seconds,paper_id):
	    """Wait ``seconds``, obtain cookies, then download the paper.

	    Calls ``authCheck("cookies.txt")`` repeatedly, pausing five seconds
	    between attempts, until it returns True (there is no attempt limit),
	    and then hands over to ``downWithCookies``.

	    Returns:
	        True if ``downWithCookies`` reported success, otherwise False.
	    """
	    time.sleep(seconds)
	    print("Starting download...")
	    count = 1
	    while True:
	        print("trying to login %d" % count)
	        if authCheck("cookies.txt"):
	            print("auth success!")
	            break
	        count += 1
	        time.sleep(5)
	    if downWithCookies("cookies.txt",paper_id):
	        print("download success!")
	        return True
	    return False

	def threadDown(seconds,paper_id):
	    """Start ``timingDown(seconds, paper_id)`` on a new thread.

	    Uses ``thread.start_new_thread``; nothing is returned to the caller.
	    """
	    thread.start_new_thread(timingDown,(seconds,paper_id))

	def test():
	    """Prompt for a paper id on stdin and download it without delay."""
	    paper_id = raw_input("Input the id of the paper:")
	    timingDown(0, paper_id)


	# Run the interactive download only when executed as a script.
	if __name__ == '__main__':
	    test()