A Python script that fetches the latest machine learning papers from arxiv.org

arXiv Scraper (ML)

Install required packages

pip install -r requirements.txt

Run the scraper

python arxiv_scrapper.py
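
On the first run the script creates the working directories it needs and records the run time; later runs only fetch papers tweeted since then. The resulting layout (paths taken from the script below):

data/
  tmp/time.txt    Unix timestamp of the previous run
  pdf/            downloaded papers, one <arxiv-id>.pdf per paper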
arxiv_scrapper.py

import datetime
import os
import time
import urllib.request

from bs4 import BeautifulSoup
from pathlib import Path

temp_file = os.path.join('data', 'tmp', 'time.txt')
current_time = datetime.datetime.now()

print('Checking for required files')
req_file = Path(temp_file)
if req_file.is_file():
    print('File exists. Getting last run time')
    with open(temp_file, 'r') as f:
        last_run = datetime.datetime.fromtimestamp(int(f.read()))
    print(str(last_run))
    # Record this run's time for the next invocation.
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    time_diff = current_time - last_run
else:
    print('Files do not exist. Creating files.')
    os.makedirs(os.path.join('data', 'tmp'))
    os.makedirs(os.path.join('data', 'pdf'))
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    # First run: treat anything from the last 24 hours as new.
    time_diff = datetime.timedelta(hours=24)

print('Getting data\n--')
page_url = 'https://twitter.com/arXiv__ml'
page = urllib.request.urlopen(page_url)
soup = BeautifulSoup(page, 'html.parser')

tweets = soup.find_all('div', attrs={'class': 'content'})
for twt in tweets:
    time_delay = twt.find_all('span', attrs={'class': '_timestamp'})
    tweet_time = datetime.datetime.fromtimestamp(int(time_delay[0]['data-time']))
    # Tweets are in reverse chronological order, so stop at the first old one.
    if time_diff > current_time - tweet_time:
        print('New content found. Getting resources. This might take a while')
        try:
            # Each tweet links to the paper's abstract page on arxiv.org.
            anchor = twt.find_all('a', attrs={'class': 'twitter-timeline-link'})
            link = anchor[0]['title']
            redir_page = urllib.request.urlopen(link)
            redir_soup = BeautifulSoup(redir_page, 'html.parser')
            # On the abstract page, the PDF link carries access key "f".
            pdf_anchor = redir_soup.find_all('a', attrs={'accesskey': 'f'})
            save_loc = pdf_anchor[0]['href'][1:] + '.pdf'   # e.g. pdf/<id>.pdf
            pdf_url = 'https://arxiv.org/' + save_loc
            print('Getting PDF. This might take a while too...')
            with open(os.path.join('data', save_loc.split('/')[0], save_loc.split('/')[1]), 'wb+') as pdf:
                res = urllib.request.urlopen(pdf_url)
                pdf.write(res.read())
            print('Done. PDF saved in data/pdf/' + save_loc.split('/')[1] + '\n--')
        except Exception:
            print('Problem getting file. Moving to the next file.\n--')
    else:
        print('No new content found. Maybe try again after a while. Quitting\n--')
        break
print('Bye')
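
Scraping rendered Twitter HTML is fragile, since class names and markup can change at any time. arXiv also publishes an official Atom API at export.arxiv.org that a future version could query instead. A minimal sketch, assuming the cs.LG category stands in for "machine learning" and using only the standard library:

import urllib.request
import xml.etree.ElementTree as ET

ATOM = '{http://www.w3.org/2005/Atom}'
# Ask the arXiv API for the five most recently submitted cs.LG papers.
url = ('http://export.arxiv.org/api/query?search_query=cat:cs.LG'
       '&sortBy=submittedDate&sortOrder=descending&max_results=5')
with urllib.request.urlopen(url) as res:
    feed = ET.fromstring(res.read())
for entry in feed.findall(ATOM + 'entry'):
    title = ' '.join(entry.findtext(ATOM + 'title').split())
    # The Atom id is the abstract URL; swapping /abs/ for /pdf/ gives the PDF.
    pdf_url = entry.findtext(ATOM + 'id').replace('/abs/', '/pdf/')
    print(title, '->', pdf_url)

This removes both the Twitter dependency and the HTML parsing, at the cost of choosing an arXiv category up front.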
requirements.txt

beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
pathlib==1.0.1
requests==2.22.0
soupsieve==1.9.1
urllib3==1.25.3