A Python script that fetches the latest machine learning papers from arxiv.org

arXiv Scraper (ML)

Install required packages

pip install -r requirements.txt

Run the scraper

python arxiv_scrapper.py
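
On the first run the script creates the working directories it needs and records the run time; later runs only fetch papers tweeted since then. The resulting layout (paths taken from the script below):

data/
  tmp/time.txt    Unix timestamp of the previous run
  pdf/            downloaded papers, one <arxiv-id>.pdf per paper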
arxiv_scrapper.py

import datetime
import os
import time
import urllib.request

from bs4 import BeautifulSoup
from pathlib import Path

temp_file = os.path.join('data', 'tmp', 'time.txt')
current_time = datetime.datetime.now()

print('Checking for required files')
req_file = Path(temp_file)
if req_file.is_file():
    print('File exists. Getting last run time')
    with open(temp_file, 'r') as f:
        last_run = datetime.datetime.fromtimestamp(int(f.read()))
    print(str(last_run))
    # Record this run's time for the next invocation.
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    time_diff = current_time - last_run
else:
    print('Files do not exist. Creating files.')
    os.makedirs(os.path.join('data', 'tmp'))
    os.makedirs(os.path.join('data', 'pdf'))
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    # First run: treat anything from the last 24 hours as new.
    time_diff = datetime.timedelta(hours=24)

print('Getting data\n--')
page_url = 'https://twitter.com/arXiv__ml'
page = urllib.request.urlopen(page_url)
soup = BeautifulSoup(page, 'html.parser')

tweets = soup.find_all('div', attrs={'class': 'content'})
for twt in tweets:
    time_delay = twt.find_all('span', attrs={'class': '_timestamp'})
    tweet_time = datetime.datetime.fromtimestamp(int(time_delay[0]['data-time']))
    # Tweets are in reverse chronological order, so stop at the first old one.
    if time_diff > current_time - tweet_time:
        print('New content found. Getting resources. This might take a while')
        try:
            # Each tweet links to the paper's abstract page on arxiv.org.
            anchor = twt.find_all('a', attrs={'class': 'twitter-timeline-link'})
            link = anchor[0]['title']
            redir_page = urllib.request.urlopen(link)
            redir_soup = BeautifulSoup(redir_page, 'html.parser')
            # On the abstract page, the PDF link carries access key "f".
            pdf_anchor = redir_soup.find_all('a', attrs={'accesskey': 'f'})
            save_loc = pdf_anchor[0]['href'][1:] + '.pdf'   # e.g. pdf/<id>.pdf
            pdf_url = 'https://arxiv.org/' + save_loc
            print('Getting PDF. This might take a while too...')
            with open(os.path.join('data', save_loc.split('/')[0], save_loc.split('/')[1]), 'wb+') as pdf:
                res = urllib.request.urlopen(pdf_url)
                pdf.write(res.read())
            print('Done. PDF saved in data/pdf/' + save_loc.split('/')[1] + '\n--')
        except Exception:
            print('Problem getting file. Moving to the next file.\n--')
    else:
        print('No new content found. Maybe try again after a while. Quitting\n--')
        break
print('Bye')
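
Scraping rendered Twitter HTML is fragile, since class names and markup can change at any time. arXiv also publishes an official Atom API at export.arxiv.org that a future version could query instead. A minimal sketch, assuming the cs.LG category stands in for "machine learning" and using only the standard library:

import urllib.request
import xml.etree.ElementTree as ET

ATOM = '{http://www.w3.org/2005/Atom}'
# Ask the arXiv API for the five most recently submitted cs.LG papers.
url = ('http://export.arxiv.org/api/query?search_query=cat:cs.LG'
       '&sortBy=submittedDate&sortOrder=descending&max_results=5')
with urllib.request.urlopen(url) as res:
    feed = ET.fromstring(res.read())
for entry in feed.findall(ATOM + 'entry'):
    title = ' '.join(entry.findtext(ATOM + 'title').split())
    # The Atom id is the abstract URL; swapping /abs/ for /pdf/ gives the PDF.
    pdf_url = entry.findtext(ATOM + 'id').replace('/abs/', '/pdf/')
    print(title, '->', pdf_url)

This removes both the Twitter dependency and the HTML parsing, at the cost of choosing an arXiv category up front.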
requirements.txt

beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
pathlib==1.0.1
requests==2.22.0
soupsieve==1.9.1
urllib3==1.25.3