Created
February 10, 2014 18:43
-
-
Save veltman/8921688 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from bs4 import BeautifulSoup | |
# Fetch the NHL HTML report; `contents` holds the response body as text.
# A timeout is passed because requests otherwise waits indefinitely for a
# stalled server, and an HTTP error status is raised here rather than letting
# an error page flow into the parsing code further down.
response = requests.get(
    "http://www.nhl.com/scores/htmlreports/20022003/PL020413.HTM",
    timeout=30,
)
response.raise_for_status()
contents = response.text
def NotEmpty(str):
    """Return True if the string has at least one non-whitespace character.

    Empty strings and strings made up only of whitespace yield False.

    The parameter keeps its original name (which shadows the builtin ``str``
    inside this function only) so that existing callers are unaffected.
    """
    # Raw string so that ``\s`` reaches the regex engine untouched instead of
    # being treated as an (invalid) string escape.
    return not re.match(r"^\s*$", str)
def SplitOnColumns(unsplit, positions):
    """Cut one fixed-width line into a list of stripped cell values.

    unsplit   -- the line of text to cut up.
    positions -- iterable of ``[start, end]`` index pairs, one per column;
                 ``end`` is inclusive.

    Returns a list with one string per pair, in the same order, with
    surrounding whitespace removed.  A pair that lies beyond the end of the
    line produces an empty string rather than an error, because slicing
    past the end of a string is allowed.
    """
    # ``+ 1`` turns the inclusive end index into an exclusive slice bound.
    return [unsplit[column[0]:column[1] + 1].strip() for column in positions]
# Parse the downloaded page and take the text of its first <pre> element,
# which holds the fixed-width table.  The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(contents, "html.parser")
pre = soup.find("pre")
if pre is None:
    raise ValueError("no <pre> element found in the downloaded page")
table = pre.text

# Get all nonempty rows.  This is built as a real list (not a lazy filter
# object) because it is iterated once below and then indexed and sliced.
# splitlines() is used so that both "\r\n" and "\n" line endings work.
rows = [row for row in table.splitlines() if NotEmpty(row)]

# For tracking col positions: one [start, end] pair per column, end inclusive
columns = []

# Find the dashes row
for row in rows:
    # If it's all spaces/dashes with at least one dash
    if re.match(r"^[\s-]+$", row) and "-" in row:
        # Each unbroken run of dashes marks the extent of one column
        columns = [[run.start(), run.end() - 1]
                   for run in re.finditer(r"-+", row)]
        break

# Get an array of headers (the first nonempty row is taken as the header row)
headers = SplitOnColumns(rows[0], columns)

# For all rows besides the first two (header row and dashes row), build a
# dict mapping each header to that row's value in the same column
as_json = [dict(zip(headers, SplitOnColumns(row, columns)))
           for row in rows[2:]]

# Parenthesised form prints the same thing on Python 2 and Python 3
print(as_json)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment