Skip to content

Instantly share code, notes, and snippets.

@patrickvossler18
Created October 12, 2015 01:38
Show Gist options
  • Save patrickvossler18/51232691458b067a980b to your computer and use it in GitHub Desktop.
Short script to scrape data from Wall Street Journal article about home field advantage for college football using Selenium. The table isn't a static html table so in order to scrape all of the data I needed a way to click through each number while scraping.
from selenium import webdriver
import csv
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
import time
# Output CSV path and the column headers of the WSJ strength-of-schedule table.
pred_SoS = "SoS.csv"
pred_SoS_headers= ["Rank", "Team", "Conference", "SoS"]
# Page counter for the paginated table (the scrape loop runs pages 1..13).
i = 1
def getrows(html=None):
    """Scrape the table on the current page and append its rows to the CSV.

    Reads the page source from the module-level Selenium ``driver`` (unless
    *html* is supplied), parses the first ``<table>`` with BeautifulSoup, and
    writes one CSV row per data ``<tr>`` via the module-level ``writer``.

    Parameters
    ----------
    html : str, optional
        Raw HTML to parse instead of ``driver.page_source`` (lets the parsing
        be exercised without a live browser session).
    """
    if html is None:
        html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table")
    if table is None:
        # The JS-rendered table may not be in the DOM yet; nothing to scrape.
        return
    for trs in table.find_all('tr'):
        tds = trs.find_all('td')
        # Header rows use <th> cells, so tds is empty there; the original
        # code wrote a blank CSV line for each such row -- skip them instead.
        if not tds:
            continue
        row = [elem.text.strip().encode('utf-8') for elem in tds]
        print(row)
        writer.writerow(row)
def nextpage():
    """Advance the paginated table by clicking its "Next" link.

    Locates the anchor whose text contains "Next" via the module-level
    Selenium ``driver`` and clicks it, triggering the next page of rows.
    """
    link_text = "Next"
    locator = '//a[contains(text(), "' + link_text + '")]'
    driver.find_element_by_xpath(locator).click()
# --- main script ------------------------------------------------------------
# Open the output CSV, drive Chrome through all 13 pages of the WSJ table,
# scraping each page before clicking "Next".
with open(pred_SoS, 'wb+') as csv_file:
    # A single csv.writer handles both the header and the data rows.
    # (The original bound a DictWriter to ``writer`` just for writeheader(),
    # then immediately clobbered it with a plain writer -- same output,
    # needlessly confusing.)
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(pred_SoS_headers)

    driver = webdriver.Chrome()
    try:
        driver.get("http://graphics.wsj.com/table/COUNT0903?embed=1")
        # time.sleep(5)  # uncomment if the table needs extra time to render
        while i < 14:  # the table spans 13 pages
            getrows()
            nextpage()
            i += 1
    finally:
        # Always release the browser, even if a page fails to load or the
        # "Next" link cannot be found/clicked.
        driver.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment