# omarayad1/acm-archive-scraper.py

## acm-archive-scraper.py
from bs4 import BeautifulSoup
import requests
import os
import tarfile
import sys

# Scrape the ICPC Live Archive: walk region -> contest -> problem listing
# pages and save each problem's PDF under archive/<region>/<contest>/.
#
# The statements below are written in the common subset of Python 2 and 3
# (single-argument print(...), dict.items()) so the script runs on either.

# One base URL for every request; the region links used to be built with
# plain http:// while all other links used https://.
BASE_URL = 'https://icpcarchive.ecs.baylor.edu/'
ARCHIVE_URL = BASE_URL + 'index.php?option=com_onlinejudge&Itemid=8&category=0'
ARCHIVE_DIR = 'archive'

# Listing tables alternate between these two CSS classes on their rows.
ROW_CLASSES = ('sectiontableentry1', 'sectiontableentry2')

# Position of the PDF anchor among all <a> tags on a problem page.
# NOTE(review): layout-dependent magic number inherited from the original
# script -- verify against the live site if downloads start failing.
PDF_LINK_INDEX = 18


def _ensure_dir(path):
    """Create *path* (and parents), tolerating only "already exists"."""
    try:
        os.makedirs(path)
    except OSError:
        # Re-raise real failures (permissions, a file in the way, ...)
        # instead of silently continuing without the directory.
        if not os.path.isdir(path):
            raise


def _fetch_soup(url):
    """Download *url* and return its parsed BeautifulSoup document."""
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    return BeautifulSoup(requests.get(url).content, 'html.parser')


def _table_links(url, prefix='', suffix=''):
    """Map the link text of each listing row on *url* to its absolute URL.

    Keys are ``prefix + link text + suffix``. Rows of the first CSS class
    are processed before rows of the second; a later row with the same
    link text overwrites an earlier one. Rows without a usable anchor are
    skipped.
    """
    soup = _fetch_soup(url)  # parse once, query twice
    links = {}
    for css_class in ROW_CLASSES:
        for row in soup.find_all(class_=css_class):
            anchor = row.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            links[prefix + anchor.text + suffix] = BASE_URL + anchor['href']
    return links


def _download_problem(problem_url, filename):
    """Save the PDF linked from the problem page at *problem_url*.

    Prints *problem_url* and returns without writing anything when the
    page does not have the expected PDF link, or when the download step
    raises TypeError (the one error the original script tolerated).
    """
    anchors = _fetch_soup(problem_url).find_all('a')
    try:
        pdf_href = anchors[PDF_LINK_INDEX]['href']
    except (IndexError, KeyError):
        # Unexpected page layout: report it and move on to the next
        # problem rather than aborting the whole run.
        print(problem_url)
        return
    try:
        # Fetch before opening the file so a failed request does not
        # leave an empty file that later runs would treat as downloaded.
        payload = requests.get(BASE_URL + pdf_href).content
        # PDFs are binary: text mode would translate newlines on Windows
        # and rejects bytes outright on Python 3.
        with open(filename, 'wb') as pdf:
            pdf.write(payload)
    except TypeError:
        print(problem_url)


def main():
    """Mirror every region/contest of the archive into ARCHIVE_DIR."""
    _ensure_dir(ARCHIVE_DIR)

    regions = _table_links(ARCHIVE_URL)
    for region in regions:
        _ensure_dir('%s/%s' % (ARCHIVE_DIR, region))

    # Collect every region's contest listing up front; keys have the
    # form "<region>/<contest>".
    contests_by_region = [
        _table_links(region_url, prefix=region + '/')
        for region, region_url in regions.items()
    ]

    for contests in contests_by_region:
        for contest_key, contest_url in contests.items():
            print('--- Downloading Region/Contest: %s' % contest_key)
            contest_dir = '%s/%s' % (ARCHIVE_DIR, contest_key)
            _ensure_dir(contest_dir)

            problems = _table_links(
                contest_url, prefix=contest_dir + '/', suffix='.pdf')
            for filename, problem_url in problems.items():
                # Files already on disk are kept, so an interrupted run
                # can be resumed.
                if os.path.isfile(filename):
                    continue
                _download_problem(problem_url, filename)


if __name__ == '__main__':
    main()