@jozefg
Last active February 1, 2024 01:28
Query the arxiv api for latest articles in a given classification.
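For example, with the script saved as arxiv_scraper.py and made executable (the filename is an assumption, not part of the gist), ./arxiv_scraper.py show cs.PL math.LO --max 10 would list up to ten of the most recently updated articles in each of cs.PL and math.LO from the past week.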
#!/usr/bin/env python3
from datetime import date, timedelta
import argparse
import time
import requests
import feedparser
from requests.exceptions import HTTPError
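
# ANSI escape sequences for colored terminal output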
OKBLUE = '\033[94m'
BOLD = '\033[1m'
ENDC = '\033[0m'
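
# The subject classifications the script knows about: arXiv's cs.* and
# math.* categories (e.g. cs.PL for programming languages, math.LO for logic).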
CS_CLASSES = [
    'cs.' + cat for cat in [
        'AI', 'AR', 'CC', 'CE', 'CG', 'CL', 'CR', 'CV', 'CY', 'DB',
        'DC', 'DL', 'DM', 'DS', 'ET', 'FL', 'GL', 'GR', 'GT', 'HC',
        'IR', 'IT', 'LG', 'LO', 'MA', 'MM', 'MS', 'NA', 'NE', 'NI',
        'OH', 'OS', 'PF', 'PL', 'RO', 'SC', 'SD', 'SE', 'SI', 'SY',
    ]
]
MATH_CLASSES = [
    'math.' + cat for cat in [
        'AC', 'AG', 'AP', 'AT', 'CA', 'CO', 'CT', 'CV', 'DG', 'DS',
        'FA', 'GM', 'GN', 'GR', 'GT', 'HO', 'IT', 'KT', 'LO',
        'MG', 'MP', 'NA', 'NT', 'OA', 'OC', 'PR', 'QA', 'RA',
        'RT', 'SG', 'SP', 'ST',
    ]
] + ['math-ph']  # math-ph is its own archive; 'math.math-ph' is not a valid category
# Which categories we can search
CLASSES = CS_CLASSES + MATH_CLASSES
# Maximum articles from each category
MAX_ARTICLES = 50
# Only show articles since this date.
OLDEST_DATE = date.today() - timedelta(days=7)
# The endpoint for the arxiv api
QUERY_ENDPOINT = 'http://export.arxiv.org/api/query'
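

# Pretty-print one article: numbered, bolded title, then authors (truncated
# to five plus 'et al.'), last-updated date, link, and journal ref if any.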
def display_article(number, title, authors, link, date, info):
    if len(authors) > 5:
        authors = authors[0:5] + ['et al.']
    print(f'{number}. ' + BOLD + title + ENDC)
    print('\t' + ', '.join(authors))
    print('\t' + time.strftime('%Y-%m-%d', date))
    print('\t' + link)
    if info:
        print('\t' + info)
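

# Query the arXiv API for a single category and print each entry updated on
# or after 'since'. Entries arrive as an Atom feed sorted newest-first, so we
# can stop at the first one older than the cutoff.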
def show_class(cls, skip, maximum, since, delay=True):
    response = requests.get(
        QUERY_ENDPOINT,
        params=[
            ('search_query', f'cat:{cls}'),
            ('sortBy', 'lastUpdatedDate'),
            ('sortOrder', 'descending'),
            ('start', skip),
            ('max_results', maximum),
        ]
    )
    try:
        response.raise_for_status()
        # As requested by arxiv, sleep for 3 seconds between requests
        if delay:
            time.sleep(3)
    except HTTPError as http_err:
        print(f'Failed to scrape {cls}.')
        print(http_err)
        return
    feed = feedparser.parse(response.content)
    articles = feed['entries']
    for i, article in enumerate(articles, start=1):
        if article.updated_parsed >= since.timetuple():
            article_info = {
                'title': article.title.replace('\n ', ''),
                'date': article.updated_parsed,
                'authors': [author['name'] for author in article.authors],
                'link': article.link,
                'info': article.arxiv_journal_ref if 'arxiv_journal_ref' in article else None,
            }
            display_article(number=i, **article_info)
        else:
            break
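

# Fetch and display articles for every requested category in turn.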
def show(classes, skip, maximum, since):
    for cls in classes[0:-1]:
        print(f'{OKBLUE}Fetching articles from {cls}...{ENDC}')
        show_class(cls, skip, maximum, since)
    # Special-case the last item so we incur no unnecessary trailing delay
    print(f'{OKBLUE}Fetching articles from {classes[-1]}...{ENDC}')
    show_class(classes[-1], skip, maximum, since, delay=False)
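

# Command-line interface: a single 'show' subcommand.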
parser = argparse.ArgumentParser(prog='arxiv_scraper')
subcommand_parsers = parser.add_subparsers(required=True, dest="cmd")
# Options and arguments for the show command
show_parser = subcommand_parsers.add_parser("show", help='Show recent articles')
show_parser.add_argument(
    "classes",
    nargs='+',
    choices=CLASSES,
    help="List of subject classifications to scrape.",
    metavar='CLASS'
)
show_parser.add_argument(
    "--max",
    type=int,
    help=f"Maximum number of articles to display, default is {MAX_ARTICLES}",
    default=MAX_ARTICLES
)
show_parser.add_argument(
    "--skip",
    type=int,
    help="Skip the first SKIP entries, default is 0",
    default=0
)
show_parser.add_argument(
    "--since",
    help=f"Only show articles since YYYY-MM-DD, default is {OLDEST_DATE}",
    default=OLDEST_DATE.isoformat()
)
args = vars(parser.parse_args())
if args['cmd'] == 'show':
    show(
        classes=args['classes'],
        maximum=args['max'],
        skip=args['skip'],
        since=date.fromisoformat(args['since'])
    )