Skip to content

Instantly share code, notes, and snippets.

@theriley106
Created January 20, 2018 21:51
Show Gist options
  • Save theriley106/9e91733e663c6160199129585e47cb1e to your computer and use it in GitHub Desktop.
Save theriley106/9e91733e663c6160199129585e47cb1e to your computer and use it in GitHub Desktop.
Grabbing College Common Data Sets
import os
import re
import subprocess

import bs4
from selenium import webdriver
def extractCollegeName(title):
    """Pick the institution name out of a search-result title.

    The title is split on "-" and the first piece that mentions
    "university", "college" or "institute" (case-insensitive) is returned
    exactly as it appears, surrounding whitespace included.  When no piece
    matches, the whole title is returned unchanged.

    Args:
        title: Text of the search-result link.

    Returns:
        The first matching piece of the title, or ``title`` itself when
        nothing matches.
    """
    # Echo the raw title to stdout, as the original script did.
    print(title)
    keywords = ('university', 'college', 'institute')
    for part in title.split("-"):
        # Lower-case the text directly rather than through str(): under
        # Python 2, str() on a unicode title with non-ASCII characters
        # raises UnicodeEncodeError.
        lowered = part.lower()
        if any(keyword in lowered for keyword in keywords):
            return part
    return title
def convertURL(element):
    """Return the first href value found in the element's markup.

    The element is rendered with ``str()`` and scanned for ``href="..."``
    attributes; the value of the first one is returned.

    Args:
        element: Any object whose ``str()`` form is HTML markup.

    Returns:
        The text between the quotes of the first ``href`` attribute.

    Raises:
        IndexError: If the markup contains no ``href="..."`` attribute.
    """
    markup = str(element)
    hrefs = re.findall('href="(.*?)"', markup)
    return hrefs[0]
def downloadAllDataSets():
    """Download Common Data Set PDFs found through a Google search.

    Opens Firefox through Selenium, loads the first results page and waits
    for the operator to press Enter (Google will probably show a CAPTCHA
    first).  It then walks 20 result pages (start=0, 10, ..., 190) and
    fetches every ``.r a`` link with wget, saving each file as
    ``CDS/<college_name>.pdf``.

    Failures are printed and skipped, so one bad link or page does not
    stop the run.  The browser is closed when the run ends, even on error
    or Ctrl-C.
    """
    search_url = ("https://www.google.com/search"
                  "?q=site:.edu+filetype:pdf+common+data+set+2016-2017&start={}")
    output_dir = "CDS"

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    driver = webdriver.Firefox()
    try:
        driver.get(search_url.format(0))
        # This will probably go to a Captcha that you need to fill out
        prompt("Continue ")
        for i in range(20):
            try:
                driver.get(search_url.format(i * 10))
                page = bs4.BeautifulSoup(driver.page_source, 'lxml')
                print(page.title.string)
                results = page.select('.r a')
            except Exception as exp:
                # Exception (not a bare except) so Ctrl-C still aborts.
                print(exp)
                print("Error on {}".format(i))
                continue

            for result in results:
                # Bound up front so the error message below cannot itself
                # fail when the URL extraction is what raised.
                url = None
                try:
                    url = convertURL(result)
                    filename = extractCollegeName(result.getText()).strip().replace(" ", "_")
                    # Keep the scraped title from escaping the output dir.
                    filename = filename.replace(os.sep, "_")
                    target = os.path.join(output_dir, filename + ".pdf")
                    # Argument list, no shell: the URL and title come from
                    # a web page and must not be interpreted as shell
                    # syntax.  The directory is part of the -O path because
                    # wget does not apply -P to an explicit -O file name.
                    # "--" stops a URL starting with "-" being read as an
                    # option.
                    status = subprocess.call(
                        ['wget', '-q', '-O', target, '--', url])
                    if status != 0:
                        print("Error on {}".format(url))
                except Exception as exp:
                    print(exp)
                    print("Error on {}".format(url))
    finally:
        # Always release the browser process.
        driver.quit()
# Script entry point: run the scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    downloadAllDataSets()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment