Created
July 9, 2019 20:16
-
-
Save jwise77/0738007ff0633d12f6ab1347b13e9c2e to your computer and use it in GitHub Desktop.
arXiv downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import xmlrpclib | |
import xmlrpc.client as xmlrpclib | |
import os | |
import urllib.request | |
# --- Site-specific settings: edit these before running the script ---------

# Directory the downloaded PDFs are written into.
papers_dir = "/dir/to/papers"

# Credentials passed to every XML-RPC call made to the blog.
user = 'username'
passwd = 'password'

# XML-RPC proxy for the blog endpoint; used for the metaWeblog.* and mt.*
# calls at the bottom of the file.
server = xmlrpclib.ServerProxy('http://your.url/xmlrpc.php')
class ArXiv:
    """Metadata scraper for a single arXiv abstract page.

    The constructor downloads the page at ``url``.  :meth:`parse` then reads
    the ``citation_*`` lines in the page's ``<head>`` to fill in the title,
    the author names and the PDF link.

    Attributes:
        url: Abstract-page URL given to the constructor.
        raw_html: Undecoded bytes of the downloaded page.
        title: Paper title, or ``None`` if no ``citation_title`` line was seen.
        pdf_url: PDF link, or ``None`` if no ``citation_pdf_url`` line was seen.
        authors: Author strings exactly as they appear in the page.
        author_list: Short author string built by :meth:`parse`
            (``None`` until :meth:`parse` has run).
        pdf_name: File name entered by the user in :meth:`meta_query`.
    """

    def __init__(self, url):
        """Download the page at *url* and keep its raw bytes.

        Raises whatever ``urllib.request.urlopen`` raises when the page
        cannot be fetched.
        """
        self.title = None
        self.pdf_name = None
        self.authors = []
        self.author_list = None
        self.url = url
        self.pdf_url = None
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(self.url) as sock:
            self.raw_html = sock.read()

    @staticmethod
    def _meta_content(line):
        """Return the 4th double-quote-delimited field of *line*, or None.

        For a line shaped like ``<meta name="..." content="VALUE" />`` that
        field is ``VALUE``.  Lines with fewer fields yield ``None`` instead
        of raising ``IndexError``.
        """
        fields = line.split('"')
        return fields[3] if len(fields) > 3 else None

    def parse(self):
        """Extract title, authors and PDF URL from ``self.raw_html``.

        Scanning stops at the first line containing ``/head``.  Afterwards
        ``self.author_list`` holds, using the text before the first comma of
        each author string (presumably the surname):

        * ``"Unknown"`` when no author line was found,
        * ``"A"`` for one author,
        * ``"A & B"`` for two authors,
        * ``"A et al."`` for three or more.
        """
        # Start from a clean list so calling parse() twice does not
        # duplicate the authors.
        self.authors = []
        for raw_line in self.raw_html.splitlines():
            # Only header lines are decoded; undecodable bytes are replaced
            # rather than aborting the whole parse.
            line = raw_line.decode("utf-8", errors="replace")
            if "/head" in line:
                break
            if "citation_author" in line:
                author = self._meta_content(line)
                if author is not None:
                    self.authors.append(author)
            if "citation_title" in line:
                title = self._meta_content(line)
                if title is not None:
                    self.title = title
            if "citation_pdf_url" in line:
                pdf_url = self._meta_content(line)
                if pdf_url is not None:
                    self.pdf_url = pdf_url

        leading_names = [author.split(",")[0] for author in self.authors]
        if not leading_names:
            # The original indexed authors[0] here and crashed.
            self.author_list = "Unknown"
        elif len(leading_names) == 1:
            self.author_list = leading_names[0]
        elif len(leading_names) == 2:
            self.author_list = leading_names[0] + " & " + leading_names[1]
        else:
            self.author_list = leading_names[0] + " et al."

    def meta_query(self, tags):
        """Interactively collect categories and a PDF file name.

        Prints the title and author list, then prompts for categories until
        a blank line is entered; each non-blank answer is appended to
        *tags* (the list is modified in place).  Finally prompts for the PDF
        file name and stores it in ``self.pdf_name``.

        Returns:
            The same *tags* list that was passed in.
        """
        print("%s (%s)" % (self.title, self.author_list))
        print("-"*72)
        while True:
            this_tag = input("Enter a paper category (blank to end): ")
            if this_tag == "":
                break
            tags.append(this_tag)
        self.pdf_name = input("Enter PDF filename: ")
        return tags

    def return_html(self):
        """Return an ``<li>`` element linking the abstract and PDF pages."""
        return "<li>(<a href=\"%s\">abs</a>, <a href=\"%s\">pdf</a>) %s, <em>%s</em></li>\n" % \
            (self.url, self.pdf_url, self.author_list, self.title)

    def save_pdf(self):
        """Download ``self.pdf_url`` to ``papers_dir/self.pdf_name``.

        ``papers_dir`` is the module-level setting.  If the target file
        already exists the user is asked whether to overwrite it; any answer
        other than ``y``/``Y`` leaves the existing file untouched and skips
        the download.
        """
        abs_path = "%s/%s" % (papers_dir, self.pdf_name)
        if os.path.exists(abs_path):
            overwrite = input("%s exists. Overwrite (y/n)? " % self.pdf_name)
            if overwrite.lower() != "y":
                return
            os.remove(abs_path)
        urllib.request.urlretrieve(self.pdf_url, abs_path)
# --- Interactive session ---------------------------------------------------
# Ask for a post title, then for arXiv identifiers one at a time.  Each paper
# is scraped, tagged, saved as a PDF and added to an HTML bullet list; a blank
# identifier ends the loop and the list is posted to the blog over XML-RPC.
blog_title = input("Blog title: ")
tags = []
post_text = "<ul>\n"
while True:
    number = input("Enter ArXiV number (blank to end): ")
    if number == "":
        break
    url = "http://arxiv.org/abs/%s" % number
    #url = "http://xxx.lanl.gov/abs/%s" % number
    entry = ArXiv(url)
    entry.parse()
    # meta_query appends the categories entered for this paper to the
    # running tag list and hands the list back.
    tags = entry.meta_query(tags)
    entry.save_pdf()
    post_text += "\t" + entry.return_html()
post_text += "</ul>\n"

# --- Upload ------------------------------------------------------------------
blog_id = 0
categories = [{'categoryId' : '4', 'isPrimary' : 1}]
blog_content = {
    "title": blog_title,
    "description": post_text,
    "mt_keywords": tags,
}
# The trailing 0 is presumably the MetaWeblog "publish" flag (create the post
# unpublished) -- confirm against the blog's API; the post is published
# explicitly by the final call below.
post_id = int(server.metaWeblog.newPost(blog_id, user, passwd, blog_content, 0))
# NOTE: the original author marked this category call as not working.
server.mt.setPostCategories(post_id, user, passwd, categories)
server.mt.publishPost(post_id, user, passwd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment