Created February 26, 2018 13:04
import os
from time import sleep

import feedparser
import pandas as pd
import requests

from constants import BASE_URL, BRUNDAGE_CATEGORIES, PICKLE_PATH, QUERY_PAGE_SIZE, QUERY_WAIT_TIME
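
# The constants module is not included in this gist. BASE_URL is presumably the
# standard arXiv API endpoint; the remaining values are illustrative guesses
# based only on how the names are used below:
#
#   BASE_URL = 'http://export.arxiv.org/api/query'
#   BRUNDAGE_CATEGORIES = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'cs.NE', 'stat.ML']
#   PICKLE_PATH = 'arxiv.pkl'
#   QUERY_PAGE_SIZE = 100
#   QUERY_WAIT_TIME = 3  # seconds; arXiv asks clients to pause between requests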


def check_for_update():
    """Return True if the arXiv has papers newer than the local pickle."""
    prev_data = pd.read_pickle(PICKLE_PATH)
    prev_latest = prev_data.published.max().date()
    # If we already have today's papers we are all set.
    if prev_latest == pd.Timestamp('now').date():
        print("Already have today's papers")
        return False
    # Otherwise poll the arXiv by requesting a single paper and checking its date.
    params = {
        'search_query': 'cat:stat.ml',
        'sortBy': 'submittedDate',
        'start': 0,
        'max_results': 1,
    }
    # Build the query string by hand so that requests does not URL-encode it.
    param_string = 'search_query={search_query}&sortBy={sortBy}&start={start}&max_results={max_results}'.format(
        **params)
    response = requests.get(BASE_URL, params=param_string)
    parsed = feedparser.parse(response.text)
    if len(parsed.entries) > 0:
        parsed_date = pd.Timestamp(parsed.entries[0]['published']).date()
        if parsed_date > prev_latest:
            print('New papers found')
            return True
        else:
            print('Found papers from {}, but latest is from {}'.format(parsed_date, prev_latest))
            return False
    print('Could not parse any papers')
    return False


def get_entry_dict(entry):
    """Return a dictionary with the items we want from a feedparser entry."""
    try:
        return {
            'title': entry['title'],
            'authors': [a['name'] for a in entry['authors']],
            'published': pd.Timestamp(entry['published']),
            'summary': entry['summary'],
            'link': entry['link'],
            'category': entry['category'],
        }
    except KeyError:
        print('Missing keys in row: {}'.format(entry))
        return None


def strip_version(link):
    """Strip the version number from an arXiv paper link."""
    return link[:-2]
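
# Example: strip_version('http://arxiv.org/abs/1802.07228v1') returns
# 'http://arxiv.org/abs/1802.07228'. Note that slicing off the last two
# characters assumes a single-digit version suffix ('v10' would be mangled).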


def load():
    """Load data from the pickle and remove duplicates."""
    df = pd.read_pickle(PICKLE_PATH)
    return (df.sort_values('published', ascending=False)
            .groupby('link').first().reset_index())
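
# If the same link appears more than once (e.g. a paper fetched on two
# different runs), the sort plus groupby('link').first() keeps only the most
# recently published row for that link.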


def fetch_updated_data(prev_data):
    """Get new papers from the arXiv server, paging until only known links remain."""

    def make_query_string(categories):
        return '+OR+'.join(['cat:' + c for c in categories])

    past_links = prev_data.link.apply(strip_version)
    i = 0  # current results page
    j = 0  # consecutive empty responses
    while True:
        params = {
            'search_query': make_query_string(BRUNDAGE_CATEGORIES),
            'sortBy': 'submittedDate',
            'start': QUERY_PAGE_SIZE * i,
            'max_results': QUERY_PAGE_SIZE,
        }
        param_string = 'search_query={search_query}&sortBy={sortBy}&start={start}&max_results={max_results}'.format(
            **params)
        response = requests.get(BASE_URL, params=param_string)
        parsed = feedparser.parse(response.text)
        entries = parsed.entries
        if len(entries) == 0:
            # The API sometimes returns an empty feed; retry up to 30 times
            # before giving up.
            print('no entries')
            j += 1
            if j >= 30:
                break
            sleep(QUERY_WAIT_TIME)
            continue
        parsed_entries = [get_entry_dict(e) for e in entries]
        results_df = pd.DataFrame(parsed_entries)
        print('Fetched {} abstracts published {} and earlier'.format(
            results_df.shape[0], results_df.published.max().date()))
        # Stop once a whole page contains nothing we have not seen before.
        new_links = ~results_df.link.apply(strip_version).isin(past_links)
        if not new_links.any():
            break
        if len(prev_data) == 0:
            prev_data = results_df.loc[new_links]
        else:
            prev_data = pd.concat((prev_data, results_df.loc[new_links]))
        i += 1
        j = 0
        sleep(QUERY_WAIT_TIME)
    return prev_data


def update_arxiv():
    """Update the arXiv data pickle with the latest abstracts."""
    if os.path.exists(PICKLE_PATH):
        prev_data = pd.read_pickle(PICKLE_PATH)
    else:
        # Start from an empty frame with the same schema as get_entry_dict().
        prev_data = pd.DataFrame(
            columns=['authors', 'category', 'link', 'published', 'summary', 'title'],
            dtype='object')
        prev_data.published = prev_data.published.astype('datetime64[ns]')
    updated_data = fetch_updated_data(prev_data)
    print('Downloaded {} new abstracts'.format(updated_data.shape[0] - prev_data.shape[0]))
    updated_data.to_pickle(PICKLE_PATH)
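

# A minimal driver, not part of the original gist: refresh the pickle when new
# papers are available, then load the de-duplicated DataFrame.
if __name__ == '__main__':
    if not os.path.exists(PICKLE_PATH) or check_for_update():
        update_arxiv()
    papers = load()
    print('{} abstracts in store'.format(len(papers)))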