@jozefg
Last active February 1, 2024 01:28
Query the arxiv api for latest articles in a given classification.
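For example, with the script saved as arxiv_scraper.py and made executable (the filename is an assumption, not part of the gist), ./arxiv_scraper.py show cs.PL math.LO --max 10 would list up to ten of the most recently updated articles in each of cs.PL and math.LO from the past week.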
#!/usr/bin/env python3
from datetime import date, timedelta
import argparse
import time
import requests
import feedparser
from requests.exceptions import HTTPError
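
# ANSI escape sequences for colored terminal output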
OKBLUE = '\033[94m'
BOLD = '\033[1m'
ENDC = '\033[0m'
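
# The subject classifications the script knows about: arXiv's cs.* and
# math.* categories (e.g. cs.PL for programming languages, math.LO for logic).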
CS_CLASSES = [
    'cs.' + cat for cat in [
        'AI', 'AR', 'CC', 'CE', 'CG', 'CL', 'CR', 'CV', 'CY', 'DB',
        'DC', 'DL', 'DM', 'DS', 'ET', 'FL', 'GL', 'GR', 'GT', 'HC',
        'IR', 'IT', 'LG', 'LO', 'MA', 'MM', 'MS', 'NA', 'NE', 'NI',
        'OH', 'OS', 'PF', 'PL', 'RO', 'SC', 'SD', 'SE', 'SI', 'SY',
    ]
]
MATH_CLASSES = [
    'math.' + cat for cat in [
        'AC', 'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS',
        'FA', 'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO',
        'MG', 'MP', 'NA', 'NT', 'OA', 'OC', 'PR', 'QA', 'RA',
        'RT', 'SG', 'SP', 'ST',
    ]
] + ['math-ph']  # math-ph is its own archive; 'math.math-ph' is not a valid category
# Which categories we can search
CLASSES = CS_CLASSES + MATH_CLASSES
# Maximum articles from each category
MAX_ARTICLES = 50
# Only show articles since this date.
OLDEST_DATE = date.today() - timedelta(days=7)
# The endpoint for the arxiv api
QUERY_ENDPOINT = 'http://export.arxiv.org/api/query'
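

# Pretty-print one article: numbered, bolded title, then authors (truncated
# to five plus 'et al.'), last-updated date, link, and journal ref if any.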
def display_article(number, title, authors, link, date, info):
    if len(authors) > 5:
        authors = authors[0:5] + ['et al.']
    print(f'{number}. ' + BOLD + title + ENDC)
    print('\t' + ', '.join(authors))
    print('\t' + time.strftime('%Y-%m-%d', date))
    print('\t' + link)
    if info:
        print('\t' + info)
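

# Query the arXiv API for a single category and print each entry updated on
# or after 'since'. Entries arrive as an Atom feed sorted newest-first, so we
# can stop at the first one older than the cutoff.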
def show_class(cls, skip, maximum, since, delay=True):
    response = requests.get(
        QUERY_ENDPOINT,
        params=[
            ('search_query', f'cat:{cls}'),
            ('sortBy', 'lastUpdatedDate'),
            ('sortOrder', 'descending'),
            ('start', skip),
            ('max_results', maximum),
        ]
    )
    try:
        response.raise_for_status()
        # As requested by arxiv, sleep for 3 seconds between requests
        if delay:
            time.sleep(3)
    except HTTPError as http_err:
        print(f'Failed to scrape {cls}.')
        print(http_err)
        return
    feed = feedparser.parse(response.content)
    articles = feed['entries']
    for i, article in enumerate(articles, start=1):
        if article.updated_parsed >= since.timetuple():
            article_info = {
                'title': article.title.replace('\n ', ''),
                'date': article.updated_parsed,
                'authors': [author['name'] for author in article.authors],
                'link': article.link,
                'info': article.arxiv_journal_ref if 'arxiv_journal_ref' in article else None,
            }
            display_article(number=i, **article_info)
        else:
            break
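

# Fetch and display articles for every requested category in turn.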
def show(classes, skip, maximum, since):
    for cls in classes[0:-1]:
        print(f'{OKBLUE}Fetching articles from {cls}...{ENDC}')
        show_class(cls, skip, maximum, since)
    # Special-case the last item so we incur no unnecessary trailing delay
    print(f'{OKBLUE}Fetching articles from {classes[-1]}...{ENDC}')
    show_class(classes[-1], skip, maximum, since, delay=False)
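

# Command-line interface: a single 'show' subcommand.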
parser = argparse.ArgumentParser(prog='arxiv_scraper')
subcommand_parsers = parser.add_subparsers(required=True, dest="cmd")
# Options and arguments for the show command
show_parser = subcommand_parsers.add_parser("show", help='Show recent articles')
show_parser.add_argument(
    "classes",
    nargs='+',
    choices=CLASSES,
    help="List of subject classifications to scrape.",
    metavar='CLASS'
)
show_parser.add_argument(
    "--max",
    type=int,
    help=f"Maximum number of articles to display, default is {MAX_ARTICLES}",
    default=MAX_ARTICLES
)
show_parser.add_argument(
    "--skip",
    type=int,
    help="Skip the first SKIP entries, default is 0",
    default=0
)
show_parser.add_argument(
    "--since",
    help=f"Only show articles since YYYY-MM-DD, default is {OLDEST_DATE}",
    default=OLDEST_DATE.isoformat()
)
args = vars(parser.parse_args())
if args['cmd'] == 'show':
    show(
        classes=args['classes'],
        maximum=args['max'],
        skip=args['skip'],
        since=date.fromisoformat(args['since'])
    )