## Install required packages

    pip install -r requirements.txt

## Run the scraper

    python arxiv_scrapper.py
# Standard-library and third-party imports.
# NOTE: a bare 'import urllib' does NOT bind the 'urllib.request' submodule,
# so every urllib.request.urlopen() call below would raise AttributeError.
# Importing the submodule explicitly fixes that (and still binds 'urllib').
import datetime
import os
import time
import urllib.request
import warnings
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Suppress BeautifulSoup's "no parser was explicitly specified" warning.
warnings.filterwarnings('ignore')
# --- Determine when the scraper last ran -------------------------------------
# The previous run's POSIX timestamp is persisted in data/tmp/time.txt.
# On a first run the data directories are created and a 24-hour lookback
# window is assumed; on later runs the window is the time since the last run.
last_run = ''
temp_file = os.path.join('data', 'tmp', 'time.txt')
current_time = datetime.datetime.now()

print('Checking for required files')
req_file = Path(temp_file)
if req_file.is_file():
    print('File Exists. Getting last run time')
    # Context manager replaces the manual open/close pair; the original also
    # called f.close() redundantly inside a 'with' block.
    with open(temp_file, 'r') as f:
        last_run = datetime.datetime.fromtimestamp(int(f.read()))
    print(str(last_run))
    # Persist this run's timestamp for the next invocation.
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    time_diff = current_time - last_run
else:
    print('Files do not exist. Creating files.')
    # exist_ok=True guards against one directory existing without the marker
    # file (the original makedirs would raise FileExistsError in that case).
    os.makedirs(os.path.join('data', 'tmp'), exist_ok=True)
    os.makedirs(os.path.join('data', 'pdf'), exist_ok=True)
    last_run = current_time
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    # No history available: treat anything within the last 24h as new.
    time_diff = datetime.timedelta(hours=24)
# --- Fetch the tweet feed and download any new arXiv PDFs --------------------
print('Getting data\n--')
page_url = 'https://twitter.com/arXiv__ml'
# Use requests (already imported, previously unused) for all HTTP fetches;
# the original relied on urllib.request, which was never actually imported.
page = requests.get(page_url)
# Name the parser explicitly instead of letting bs4 guess (and warn).
soup = BeautifulSoup(page.text, 'html.parser')
tweets = soup.find_all('div', attrs={'class': 'content'})
for twt in tweets:
    time_delay = twt.find_all('span', attrs={'class': '_timestamp'})
    tweet_time = datetime.datetime.fromtimestamp(int(time_delay[0]['data-time']))
    # Tweets are newest-first; stop at the first one older than the window.
    if time_diff > current_time - tweet_time:
        print('New content found. Getting resources. This might take a while')
        try:
            anchor = twt.find_all('a', attrs={'class': 'twitter-timeline-link'})
            link = anchor[0]['title']
            redir_soup = BeautifulSoup(requests.get(link).text, 'html.parser')
            # accesskey="f" marks the "Download PDF" link on an arXiv abstract page.
            pdf_anchor = redir_soup.find_all('a', attrs={'accesskey': 'f'})
            save_loc = pdf_anchor[0]['href'][1:] + '.pdf'  # e.g. 'pdf/1234.56789.pdf'
            pdf_url = 'https://arxiv.org/' + save_loc
            print('Getting PDF. This might take a while too...')
            res = requests.get(pdf_url)
            # 'with' guarantees the handle is closed even if the write fails
            # (the original leaked the file object on error).
            subdir, fname = save_loc.split('/')[0], save_loc.split('/')[1]
            with open(os.path.join('data', subdir, fname), 'wb') as pdf:
                pdf.write(res.content)
            print('Done. Pdf saved in data/pdf/' + save_loc.split('/')[1] + '\n--')
        except Exception:
            # Narrowed from a bare 'except:' (which also swallowed SystemExit
            # and KeyboardInterrupt); best-effort: skip this tweet, continue.
            print('Problem getting file. Moving to next file.\n--')
    else:
        print('No new content found. Maybe, try after a while. Quitting\n--')
        break
print('Bye')
beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
pathlib==1.0.1
requests==2.22.0
soupsieve==1.9.1
urllib3==1.25.3