Skip to content

Instantly share code, notes, and snippets.

@jwise77
Created July 9, 2019 20:16
Show Gist options
  • Save jwise77/0738007ff0633d12f6ab1347b13e9c2e to your computer and use it in GitHub Desktop.
Save jwise77/0738007ff0633d12f6ab1347b13e9c2e to your computer and use it in GitHub Desktop.
arXiv downloader
#import xmlrpclib
import xmlrpc.client as xmlrpclib
import os
import urllib.request
# --- User configuration -----------------------------------------------------
# NOTE(review): the values below look like placeholders meant to be edited
# before running the script -- confirm and avoid committing real credentials.

# Directory that ArXiv.save_pdf() writes the downloaded PDFs into.
papers_dir = "/dir/to/papers"
# Login passed to every XML-RPC call made at the bottom of this script.
user = 'username'
passwd = 'password'
# XML-RPC proxy for the blog that receives the generated post.
server = xmlrpclib.ServerProxy('http://your.url/xmlrpc.php')
class ArXiv:
    """An arXiv abstract page plus the citation metadata scraped from it.

    Constructing an instance downloads the page at ``url``.  ``parse`` then
    fills in ``title``, ``authors``, ``pdf_url`` and ``author_list`` from the
    ``citation_*`` lines that appear before the end of the HTML head.
    """

    def __init__(self, url):
        """Download the abstract page at *url* and keep its raw bytes.

        Any exception raised by ``urllib.request.urlopen`` or by reading the
        response propagates to the caller.
        """
        self.title = None
        self.pdf_name = None
        self.authors = []
        # Defined up front so return_html()/meta_query() do not raise
        # AttributeError when parse() has not been called yet.
        self.author_list = ""
        self.url = url
        self.pdf_url = None
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(self.url) as sock:
            self.raw_html = sock.read()

    @staticmethod
    def _quoted_value(line):
        """Return the second double-quoted value on *line*, or None if absent."""
        fields = line.split('"')
        return fields[3] if len(fields) > 3 else None

    def parse(self):
        """Extract title, authors and PDF URL from the downloaded page.

        Scanning stops at the first line containing ``/head``.  Lines that
        mention a ``citation_*`` key but do not carry two quoted values are
        skipped instead of raising ``IndexError``.  ``author_list`` becomes
        ``"Surname"``, ``"Surname & Surname"`` or ``"Surname et al."``, and
        the empty string when no author line was found.
        """
        # Collected locally so that calling parse() twice does not
        # duplicate the author entries.
        authors = []
        for raw_line in self.raw_html.splitlines():
            # Tolerate stray undecodable bytes rather than aborting the parse.
            line = raw_line.decode(errors="replace")
            if "/head" in line:
                break
            value = self._quoted_value(line)
            if value is None:
                continue
            if "citation_author" in line:
                authors.append(value)
            if "citation_title" in line:
                self.title = value
            if "citation_pdf_url" in line:
                self.pdf_url = value
        self.authors = authors

        # Only the part before the first comma of each author entry is shown.
        surnames = [author.split(",")[0] for author in authors]
        if not surnames:
            self.author_list = ""
        elif len(surnames) == 1:
            self.author_list = surnames[0]
        elif len(surnames) == 2:
            self.author_list = surnames[0] + " & " + surnames[1]
        else:
            self.author_list = surnames[0] + " et al."

    def meta_query(self, tags):
        """Prompt for categories and a PDF filename for this paper.

        Each non-blank category is appended to *tags* (mutated in place); a
        blank entry ends the prompt loop.  The filename typed afterwards is
        stored in ``pdf_name``.  Returns the same *tags* list.
        """
        print("%s (%s)" % (self.title, self.author_list))
        print("-" * 72)
        while True:
            tag = input("Enter a paper category (blank to end): ")
            if tag == "":
                break
            tags.append(tag)
        self.pdf_name = input("Enter PDF filename: ")
        return tags

    def return_html(self):
        """Return an ``<li>`` line linking to the abstract and the PDF."""
        return "<li>(<a href=\"%s\">abs</a>, <a href=\"%s\">pdf</a>) %s, <em>%s</em></li>\n" % \
            (self.url, self.pdf_url, self.author_list, self.title)

    def save_pdf(self):
        """Download the PDF to ``papers_dir``/``pdf_name``.

        When the target file already exists the user is asked whether to
        overwrite it; any answer other than ``y``/``Y`` leaves the existing
        file untouched and skips the download.
        """
        abs_path = "%s/%s" % (papers_dir, self.pdf_name)
        if os.path.exists(abs_path):
            overwrite = input("%s exists. Overwrite (y/n)? " % self.pdf_name)
            if overwrite.lower() != "y":
                return
            os.remove(abs_path)
        urllib.request.urlretrieve(self.pdf_url, abs_path)
# Interactive driver: collect arXiv entries from the user, download each PDF,
# then submit the resulting HTML list as a blog post over XML-RPC.
blog_title = input("Blog title: ")
tags = []
items = []
while True:
    number = input("Enter ArXiV number (blank to end): ")
    if not number:
        break
    # An alternative mirror used previously was http://xxx.lanl.gov/abs/<number>.
    url = "http://arxiv.org/abs/%s" % number
    entry = ArXiv(url)
    entry.parse()
    # Categories accumulate across all papers and become the post keywords.
    tags = entry.meta_query(tags)
    entry.save_pdf()
    items.append("\t" + entry.return_html())
post_text = "<ul>\n" + "".join(items) + "</ul>\n"

blog_id = 0
blog_content = {
    "title": blog_title,
    "description": post_text,
    "mt_keywords": tags,
}
categories = [{'categoryId': '4', 'isPrimary': 1}]
# NOTE(review): the trailing 0 is presumably the "publish" flag, i.e. the post
# is created unpublished and published explicitly below -- confirm against the
# blog's XML-RPC API.
post_id = int(server.metaWeblog.newPost(blog_id, user, passwd, blog_content, 0))
# The original author marked this call as "not work".
server.mt.setPostCategories(post_id, user, passwd, categories)
server.mt.publishPost(post_id, user, passwd)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment