-
-
Save mgrube/cd01c37646321c1c52bb1531d026a040 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import httplib2 | |
import feedparser | |
import os | |
import urllib | |
import time | |
#Search Arxiv for something. | |
#Return XML response | |
#Masquerade as Windows 7 user with Firefox 30 | |
def search(searchTerm, maxresults):
    """Query the arXiv export API and return the raw response body.

    searchTerm -- phrase to look for; it is sent as an ``all:`` query,
                  i.e. matched against every field.
    maxresults -- maximum number of entries to request.

    Returns the response content exactly as httplib2 delivers it; the
    caller is expected to hand it to a feed parser.
    """
    # Local import so the block runs whether ``quote`` lives in urllib
    # (Python 2) or urllib.parse (Python 3).
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    # Percent-encode the whole term, not only spaces: characters such as
    # '&', '#' or '?' would otherwise cut off or corrupt the query string.
    encodedTerm = quote(searchTerm, safe="")
    url = ("http://export.arxiv.org/api/query?search_query=all:" + encodedTerm
           + "&start=0&max_results=" + str(maxresults))
    print("URL: " + url)
    # Present a browser User-Agent instead of the library default.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20131011 Firefox/30.0'}
    http = httplib2.Http()
    response, content = http.request(url, 'GET', headers=headers)
    return content
#This is where we do something useful :3 | |
def pdfDownload(article_url):
    """Download the PDF of an arXiv article into the current directory.

    article_url -- the article's abstract URL; its ``/abs/`` path segment
                   is rewritten to ``/pdf/`` before the request is made.

    The file is named after the last path segment of the URL plus
    ``.pdf``. Nothing is written unless the server answers HTTP 200.
    """
    http = httplib2.Http()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20131011 Firefox/30.0'}
    pdf_url = article_url.replace('/abs/', '/pdf/')
    response, content = http.request(pdf_url, 'GET', headers=headers)
    filename = pdf_url.split("/")[-1] + ".pdf"

    # Do not save an error page under the PDF's name: isNew() only checks
    # for the file's existence, so a bad file would never be re-fetched.
    if response.status != 200:
        print('Failed to download ' + filename + ' (HTTP ' + str(response.status) + ')')
        return

    # The with-block guarantees the handle is flushed and closed.
    with open(filename, 'wb') as pdf:
        pdf.write(content)
    print('Downloaded ' + filename)
#Takes an article URL, converts to a pdf filename and | |
#checks the current directory to see if we already have it | |
def isNew(articleid):
    """Return True when no PDF for this article is in the current directory.

    The expected filename is the last '/'-separated segment of
    *articleid* followed by ".pdf".
    """
    wanted = articleid.rsplit("/", 1)[-1] + ".pdf"
    return wanted not in os.listdir("./")
#Load our search phrases from our searchterms file | |
# One search phrase per line; the with-block closes the file once read.
with open('searchterms.txt', 'rb') as searchtermsfile:
    searchterms = searchtermsfile.readlines()

for s in searchterms:
    term = s.strip()
    # A blank line would produce an empty "all:" query, so skip it.
    if not term:
        continue
    print("Grabbing pdfs related to " + term)
    results = search(term, 250)
    tree = feedparser.parse(results)
    print("Got " + str(len(tree['entries'])) + " entries")
    for e in tree['entries']:
        # Only fetch articles whose PDF is not already on disk.
        if isNew(e['id']):
            pdfDownload(e['id'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment