Last active
December 21, 2015 09:59
-
-
Save willwade/6288982 to your computer and use it in GitHub Desktop.
This takes one local BibTeX file, which may have PDF attachments linked to it, and then checks another CiteULike user's account for any PDFs that may be missing. NB: I would ideally like to make use of the JSON API for CiteULike, but it's a little hazy on uploading. NB2: I needed to patch my httplib with this: http://bugs.python.org/issue11898…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Contact: Will Wade willwa.de
# Date: April 2013
# Needs mechanize and pybtex
#
# Given a CiteULike user name and password, and a local BibTeX file whose
# entries link to attachments, upload any attachments that are missing
# from that user's CiteULike library.
import os
import time
import urllib

import mechanize
from pybtex.database.input import bibtex

try:
    import simplejson as json
except ImportError:  # simplejson not installed: use the stdlib equivalent
    import json
# settings
# Every value can be overridden through the environment so that real
# credentials and machine-specific paths need not be edited into this file;
# the fallbacks are the original hard-coded values.
cUser = os.environ.get('CITEULIKE_USER', 'testuser')
cPass = os.environ.get('CITEULIKE_PASS', 'testpass')
localbib = os.environ.get('CITEULIKE_LOCALBIB',
                          '/Users/willwade/Dropbox/Papers/willwade.bib')
localpapers = os.environ.get('CITEULIKE_LOCALPAPERS',
                             '/Users/willwade/Dropbox/Papers/')
tempdir = os.environ.get('CITEULIKE_TEMPDIR', '/Users/willwade/Desktop/')
class CulError(Exception):
    """Raised when a CiteULike request yields a response we cannot handle."""

    # Optional underlying error or response attached by the raiser.  Declared
    # here so handlers can always read ``err.data``, even when the raiser did
    # not set it.
    data = None
class CiteULikeReader(object):
    """Upload locally stored attachments to a user's CiteULike library.

    The reader parses a local BibTeX file, logs in to CiteULike through
    ``mechanize``, downloads the user's BibTeX export and, for every exported
    entry that has no ``citeulike-attachment-1`` field, uploads the file named
    by the matching local entry when that file exists on disk.

    NOTE: constructing an instance parses ``localbib`` and performs the login
    request immediately.
    """

    # Minimum number of seconds enforced between two throttled requests.
    MIN_API_WAIT = 5

    # Query string appended to the per-user BibTeX export URL.
    _EXPORT_QUERY = ('?do_username_prefix=0&key_type=4&incl_amazon=0'
                     '&clean_urls=1&smart_wrap=0&export_attachment_names=t'
                     '&fieldmap=posted-at:date-added')

    def __init__(self, user, password, localbib='', localpapers='.',
                 tempdir='/tmp/'):
        """Parse the local BibTeX file and log in to CiteULike.

        Args:
            user: CiteULike user name.
            password: CiteULike password.
            localbib: path of the local BibTeX file to take attachments from.
            localpapers: directory that holds the local attachment files.
            tempdir: directory the downloaded BibTeX export is written to.

        Raises:
            CulError: if the login is rejected or answered unexpectedly.
        """
        self.cUser = user
        self.cPass = password
        # Only flipped to True once CiteULike has confirmed the login.
        self.loggedin = False
        self.cites = ''
        self.localbib = localbib
        self.localpapers = localpapers
        self.tempdir = tempdir
        # Back-date the stamp so the very first request is not delayed.
        self.last_api_access = time.time() - self.MIN_API_WAIT
        checkparser = bibtex.Parser()
        self.checkbibdata = checkparser.parse_file(localbib)
        self.loginToCiteULike()

    def _user_bib_path(self):
        """Return the path the user's downloaded BibTeX export is stored at."""
        return os.path.join(self.tempdir, self.cUser + '.bib')

    def wait_for_api_limit(self, min_wait=0):
        """Sleep until enough time has passed since the last throttled call.

        Args:
            min_wait: requested gap in seconds; never less than
                ``MIN_API_WAIT`` is used.
        """
        min_wait = max(min_wait, self.MIN_API_WAIT)
        elapsed_time = time.time() - self.last_api_access
        if elapsed_time < min_wait:
            time.sleep(min_wait - elapsed_time)
        self.last_api_access = time.time()

    def loginToCiteULike(self):
        """Log in through the HTML login form, populating the cookie jar.

        Raises:
            CulError: if CiteULike reports a failed login or redirects
                somewhere unexpected (the HTTP error is attached as ``data``).
            mechanize.HTTPError: for any HTTP error other than the 302
                redirect that answers the login form.
        """
        self.browser = mechanize.Browser()
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [
            ("User-agent", 'willwade/willwade@gmail.com citeusyncpy/1.0'),
        ]
        self.browser.open('http://www.citeulike.org/login?from=/')
        self.browser.select_form(name='frm')
        self.browser["username"] = self.cUser
        self.browser["password"] = self.cPass
        self.wait_for_api_limit()
        try:
            # Handle redirects manually to avoid connection flakiness: with
            # redirect handling off, the 302 answer surfaces as an HTTPError.
            self.browser.set_handle_redirect(False)
            self.browser.submit()
            # NOTE(review): a response without a redirect is treated as "not
            # logged in"; confirm what CiteULike sends in that case.
        except mechanize.HTTPError as e:
            # This may not work for non-gold users. See
            # http://www.citeulike.org/groupforum/2949?highlight=41927#msg_41927
            # for ideas.. feel free to write
            if e.getcode() != 302:
                raise
            next_page = e.info().getheader('Location')
            if next_page == 'http://www.citeulike.org/':
                # success
                self.loggedin = True
                # Older revisions recorded success under this misspelt name;
                # kept so code reading it continues to work.
                self.logged_in = True
            elif next_page and 'status=login-failed' in next_page:
                raise CulError('Login Failed')
            else:
                err = CulError('Unknown login response')
                err.data = e
                raise err
        finally:
            self.browser.set_handle_redirect(True)

    def loginToCiteULikeJSON(self):
        """Log in through the JSON endpoint (experimental).

        NB: I'd like this to work
        """
        self.browser = mechanize.Browser()
        self.browser.set_handle_robots(False)
        self.browser.addheaders = [
            ("User-agent", 'willwade/willwade@gmail.com citeuulpy/1.0'),
        ]
        # Encode the credentials exactly once; encoding the already encoded
        # string again would send a quoted JSON string instead of an object.
        data = json.dumps({'username': self.cUser, 'password': self.cPass})
        res = self.browser.open('http://www.citeulike.org/login.json',
                                data=data)
        print(res)
        self.loggedin = True
        self.wait_for_api_limit()

    def uploadFileToCitationJSON(self, artid, file):
        """Upload ``file`` to article ``artid`` through the JSON API.

        Not implemented.  The earlier sketch tried to POST a JSON document
        holding ``username``, ``article_id`` and the open file to
        ``http://www.citeulike.org/personal_pdf_upload.json``, but a file
        object cannot be JSON-encoded, so it could never send a request.
        Use ``uploadFileToCitationMech`` instead.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            'JSON upload is not implemented; use uploadFileToCitationMech')

    def getBibText(self):
        """Download the user's BibTeX export into the temp directory."""
        url = ('http://www.citeulike.org/bibtex/user/' + self.cUser
               + self._EXPORT_QUERY)
        # Throttle like every other request made through this class.
        self.wait_for_api_limit()
        self.browser.retrieve(url, self._user_bib_path())

    def doesCiteExist(self, citation):
        """Return the attachment file name the local BibTeX has for a key.

        Args:
            citation: BibTeX key to look up in the local bibliography.

        Returns:
            The last path component of the second ';'-separated part of the
            entry's ``citeulike-attachment-1`` field, or ``False`` when the
            key is unknown, has no such field, or the field has no usable
            path.
        """
        entries = self.checkbibdata.entries
        if citation not in entries:
            return False
        fields = entries[citation].fields
        if "citeulike-attachment-1" not in fields:
            return False
        parts = fields["citeulike-attachment-1"].split(';')
        if len(parts) < 2:
            return False
        # Take the final component rather than a fixed index so the lookup
        # does not depend on how deep the attachment path happens to be.
        file_name = parts[1].strip().rsplit('/', 1)[-1]
        return file_name or False

    def uploadFileToCitationMech(self, artid, file):
        """Upload ``file`` to article ``artid`` via the article page's form.

        Args:
            artid: CiteULike article id of the target entry.
            file: path of the local file to upload.

        Raises:
            CulError: if submitting the upload form fails with an HTTP error
                (the error is attached as ``data``).
        """
        print('so want to upload %s  to  %s' % (file, artid))
        self.browser.open('http://www.citeulike.org/user/%s/article/%s'
                          % (self.cUser, artid))
        self.browser.select_form(name="fileupload_frm")
        # Binary mode: the attachment is a PDF, not text.  The handle has to
        # stay open until the form has been submitted.
        handle = open(file, 'rb')
        try:
            self.browser.form.add_file(handle, 'application/pdf',
                                       filename=os.path.basename(file),
                                       name='file')
            self.browser.submit()
            self.wait_for_api_limit()
        except mechanize.HTTPError as e:
            print('error')
            print(e.getcode())
            # The error object carries the response body; no response
            # variable exists when submit() itself raised.
            print(e.read())
            err = CulError('Upload failed')
            err.data = e
            raise err
        finally:
            handle.close()

    def uploadFileToCitation(self, artid, file):
        """Upload ``file`` to article ``artid`` with a plain form POST.

        Experimental alternative to ``uploadFileToCitationMech``.

        Args:
            artid: CiteULike article id of the target entry.
            file: path of the local file to upload.

        Returns:
            The body of the server's response.
        """
        try:
            from urllib import urlencode  # Python 2
        except ImportError:
            from urllib.parse import urlencode  # Python 3

        print('so want to upload %s  to  %s' % (file, artid))
        with open(file, 'rb') as handle:
            content = handle.read()
        # NOTE(review): the file's bytes are sent as an ordinary form field;
        # the endpoint may require multipart/form-data instead - confirm.
        data = urlencode({'username': self.cUser,
                          'article_id': artid,
                          'file': content,
                          'check': 'v2'
                          })
        print({'username': self.cUser,
               'article_id': artid,
               'file': file,
               'check': 'v2'
               })
        self.browser.open('http://www.citeulike.org/personal_pdf_upload', data)
        body = self.browser.response().get_data()
        print(body)
        return body

    def parseUserBibTex(self):
        """Upload a local attachment for each remote entry lacking one.

        Walks the downloaded BibTeX export (see ``getBibText``).  Entries that
        already have a ``citeulike-attachment-1`` field are reported and
        skipped; for the others the local bibliography is consulted and the
        file is uploaded when it exists under ``localpapers``.
        """
        bib_path = self._user_bib_path()
        print(bib_path)
        parser = bibtex.Parser()
        bibdata = parser.parse_file(bib_path)
        # loop through the individual references
        for bib_id in bibdata.entries:
            fields = bibdata.entries[bib_id].fields
            # does the entry already have an attachment? If so move on..
            if "citeulike-attachment-1" in fields:
                print('%s ...exists' % bib_id)
                continue
            file_name = self.doesCiteExist(bib_id)
            if not file_name:
                continue
            local_path = os.path.join(self.localpapers, file_name)
            print(local_path)
            if not os.path.isfile(local_path):
                continue
            print('exists & ready to upload')
            if 'citeulike-article-id' not in fields:
                # Without the article id there is no page to upload to.
                print('%s has no citeulike-article-id; skipping' % bib_id)
                continue
            self.uploadFileToCitationMech(fields['citeulike-article-id'],
                                          local_path)
# Run the sync only when executed as a script, so that importing this module
# does not log in to CiteULike or upload anything as a side effect.
if __name__ == '__main__':
    cureader = CiteULikeReader(cUser, cPass, localbib, localpapers, tempdir)
    # Fetch the user's current library export, then fill in the attachments
    # that are missing from it.
    cureader.getBibText()
    cureader.parseUserBibTex()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment