Skip to content

Instantly share code, notes, and snippets.

@mgrube

mgrube/stuff Secret

Created August 6, 2016 18:09
Show Gist options
  • Save mgrube/cd01c37646321c1c52bb1531d026a040 to your computer and use it in GitHub Desktop.
Save mgrube/cd01c37646321c1c52bb1531d026a040 to your computer and use it in GitHub Desktop.
import httplib2
import feedparser
import os
import urllib
import time
#Search Arxiv for something.
#Return XML response
#Masquerade as Windows 7 user with Firefox 30
def search(searchTerm, maxresults):
    """Query the arXiv export API and return the raw response body.

    searchTerm -- free-text phrase; it is searched with the "all:" prefix.
    maxresults -- value sent as the max_results query parameter.

    Returns the response content exactly as httplib2 hands it back.
    The HTTP status is not inspected here.
    """
    # Function-scope import so the block runs on both Python 2 and 3
    # without depending on the file-level imports.
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Percent-encode the whole term, not only spaces: characters such as
    # '&', '#' or '+' would otherwise cut short or alter the query string.
    encoded_term = quote(searchTerm)
    url = (
        "http://export.arxiv.org/api/query?search_query=all:" + encoded_term
        + "&start=0&max_results=" + str(maxresults)
    )
    print("URL: " + url)
    # Browser-like User-Agent; pdfDownload sends the same string.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20131011 Firefox/30.0'}
    http = httplib2.Http()
    _response, content = http.request(url, 'GET', headers=headers)
    return content
#This is where we do something useful :3
def pdfDownload(article_url):
    """Download the PDF for an article into the current directory.

    article_url -- the article's abstract URL; its '/abs/' segment is
                   rewritten to '/pdf/' before the request is made.

    The file is named after the last path segment of the URL plus ".pdf".
    Nothing is written unless the server answers 200, so an error page is
    never saved under a .pdf name (isNew() would then report the article
    as already downloaded and it would never be retried).
    """
    http = httplib2.Http()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20131011 Firefox/30.0'}
    article_url = article_url.replace('/abs/', '/pdf/')
    filename = article_url.split("/")[-1] + ".pdf"
    response, content = http.request(article_url, 'GET', headers=headers)
    if response.status != 200:
        print('Failed to download ' + filename + ' (HTTP ' + str(response.status) + ')')
        return
    # Context manager guarantees the handle is flushed and closed even if
    # the write raises.
    with open(filename, 'wb') as pdf:
        pdf.write(content)
    print('Downloaded ' + filename)
#Takes an article URL, converts to a pdf filename and
#checks the current directory to see if we already have it
def isNew(articleid):
    """Return True when no PDF for this article is in the current directory.

    articleid -- article URL or id; the text after its last '/' plus
                 ".pdf" is the filename that is looked up.
    """
    filename = articleid.split("/")[-1] + ".pdf"
    # Direct existence check instead of listing the whole directory on
    # every call; the name contains no '/', so it resolves against the
    # current working directory.
    return not os.path.exists(filename)
#Load our search phrases from our searchterms file
# One phrase per line. Text mode keeps the phrases as str on both Python 2
# and 3; blank lines are dropped so an empty "all:" query is never sent.
with open('searchterms.txt', 'r') as searchtermsfile:
    searchterms = [line.strip() for line in searchtermsfile if line.strip()]

for s in searchterms:
    print("Grabbing pdfs related to " + s)
    results = search(s, 250)
    tree = feedparser.parse(results)
    print("Got " + str(len(tree['entries'])) + " entries")
    for e in tree['entries']:
        # Fetch only the articles whose PDF is not already on disk.
        if isNew(e['id']):
            pdfDownload(e['id'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment