# theriley106/scrapeCDS.py

## scrapeCDS.py
import os
import re
import subprocess

import bs4
from selenium import webdriver

def extractCollegeName(title):
	"""Return the dash-separated piece of *title* that names a school.

	The title is split on "-" and the first piece containing
	"university", "college" or "institute" (case-insensitive) is returned
	exactly as it appears, including surrounding whitespace. If no piece
	matches, the whole title is returned unchanged.

	The title is also echoed to stdout as a progress trace.
	"""
	print(title)
	keywords = ('university', 'college', 'institute')
	for part in title.split("-"):
		# Lower-case the piece directly instead of going through str():
		# on Python 2, str() of a non-ASCII unicode title raises
		# UnicodeEncodeError.
		lowered = part.lower()
		if any(keyword in lowered for keyword in keywords):
			return part
	return title

def convertURL(element):
	"""Return the value of the first href="..." attribute in *element*.

	*element* is converted to its string form and searched with a regular
	expression. An IndexError is raised when that string contains no
	href attribute.
	"""
	markup = str(element)
	hrefs = re.findall('href="(.*?)"', markup)
	return hrefs[0]


def downloadAllDataSets():
	"""Download the Common Data Set PDFs that Google finds on .edu sites.

	Opens Firefox through Selenium, loads the first page of the search and
	waits for the user to press Enter (Google usually shows a Captcha
	first). It then walks 20 result pages (start=0, 10, ..., 190) and, for
	every anchor matched by the '.r a' selector, downloads the link target
	with wget into CDS/<name>.pdf, where <name> is derived from the link
	text via extractCollegeName().

	A failure on one result or one page is printed and skipped so that a
	single bad link does not stop the whole run. The browser is always
	closed when the function returns.
	"""
	searchURL = 'https://www.google.com/search?q=site:.edu+filetype:pdf+common+data+set+2016-2017&start={}'
	outputDir = 'CDS'
	if not os.path.isdir(outputDir):
		os.makedirs(outputDir)

	driver = webdriver.Firefox()
	try:
		driver.get(searchURL.format(0))
		# This will probably go to a Captcha that you need to fill out
		try:
			prompt = raw_input  # Python 2
		except NameError:
			prompt = input  # Python 3
		prompt("Continue ")

		for i in range(20):
			try:
				driver.get(searchURL.format(i * 10))
				page = bs4.BeautifulSoup(driver.page_source, 'lxml')
				print(page.title.string)
			except Exception as exp:
				# Skip this page only; Ctrl-C is not an Exception and still
				# aborts the run.
				print(exp)
				print("Error on {}".format(i))
				continue

			for result in page.select('.r a'):
				# Bound before the try so the error message below can never
				# hit an unbound name when convertURL() itself fails.
				url = None
				try:
					url = convertURL(result)
					filename = extractCollegeName(result.getText()).strip().replace(" ", "_")
					# The name comes from page text; keep it inside outputDir.
					filename = filename.replace("/", "_")
					target = os.path.join(outputDir, filename + '.pdf')
					# wget writes to the -O path as given, so the directory
					# goes into that path instead of a separate -P option.
					# An argument list (no shell) keeps scraped text from
					# being interpreted as shell syntax.
					status = subprocess.call(['wget', '-q', '-O', target, url])
					if status != 0:
						print("Error on {}".format(url))
				except Exception as exp:
					print(exp)
					print("Error on {}".format(url))
	finally:
		driver.quit()

if __name__ == "__main__":
	# Run the scraper only when this file is executed as a script.
	downloadAllDataSets()