Created
October 12, 2015 01:38
-
-
Save patrickvossler18/51232691458b067a980b to your computer and use it in GitHub Desktop.
Short script that uses Selenium to scrape data from a Wall Street Journal article about home-field advantage in college football. The table isn't a static HTML table, so to scrape all of the data I needed a way to click through each page of the widget while scraping.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
import csv | |
from bs4 import BeautifulSoup | |
from selenium.webdriver.common.action_chains import ActionChains | |
import time | |
# Output CSV path and column headers for the scraped strength-of-schedule table.
pred_SoS = "SoS.csv"
pred_SoS_headers= ["Rank", "Team", "Conference", "SoS"]
# Page counter for the paginated WSJ widget; incremented in the main loop below.
i = 1
def getrows(): | |
html = driver.page_source | |
soup = BeautifulSoup(html, "lxml") | |
table = soup.find("table") | |
#print table | |
for trs in table.find_all('tr'): | |
tds = trs.find_all('td') | |
row = [elem.text.strip().encode('utf-8') for elem in tds] | |
print row | |
writer.writerow(row) | |
def nextpage():
    """Advance the paginated WSJ table widget by clicking its "Next" link.

    Uses the module-level Selenium ``driver``; raises NoSuchElementException
    if no "Next" link is present on the current page.
    """
    link_xpath = '//a[contains(text(), "Next")]'
    next_link = driver.find_element_by_xpath(link_xpath)
    next_link.click()
# Drive Chrome through all 13 pages of the WSJ table widget, appending each
# page's rows to SoS.csv via getrows()/nextpage().
with open(pred_SoS, 'wb+') as csv_file:
    # A plain csv.writer handles both the header and the data rows; the
    # original DictWriter was created only to write the header and was then
    # immediately shadowed by this writer anyway.
    writer = csv.writer(csv_file)
    writer.writerow(pred_SoS_headers)
    driver = webdriver.Chrome()
    driver.get("http://graphics.wsj.com/table/COUNT0903?embed=1")
    # time.sleep(5)
    try:
        while i < 14:
            getrows()
            nextpage()
            i += 1
    finally:
        # quit() (not close()) ends the whole session and the chromedriver
        # process, even if nextpage() fails on the final page.
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment