Skip to content

Instantly share code, notes, and snippets.

@veltman
Created February 10, 2014 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veltman/8921688 to your computer and use it in GitHub Desktop.
Save veltman/8921688 to your computer and use it in GitHub Desktop.
import requests
import re
from bs4 import BeautifulSoup
# Download the NHL HTML report page and keep its decoded body text
contents = requests.get(
    "http://www.nhl.com/scores/htmlreports/20022003/PL020413.HTM"
).text
def NotEmpty(str):
    """Return True when the string has at least one non-whitespace character.

    The parameter keeps its original name (which shadows the builtin) so that
    existing callers, including keyword callers, are unaffected.
    """
    # Raw string: the whitespace escape must reach the regex engine untouched
    return re.match(r"^\s*$", str) is None
def SplitOnColumns(unsplit, positions):
    """Cut a fixed-width line into one whitespace-stripped field per column.

    unsplit   -- the line of text to slice.
    positions -- sequence of [start, end] pairs; both indexes are inclusive.

    Returns a list with one string per entry in positions, in the same order.
    A column lying beyond the end of the line yields an empty string, because
    slicing past the end of a string is not an error.
    """
    # end is inclusive, hence the +1 when slicing
    return [unsplit[column[0]:column[1] + 1].strip() for column in positions]
# Parse the page and take the text of its first <pre> element.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(contents, "html.parser")
table = soup.find("pre").text

# Get all nonempty rows. This must be a real list (not a lazy filter object):
# the rows are looped over below and then indexed and sliced again.
rows = [line for line in table.split("\r\n") if NotEmpty(line)]

# For tracking col positions, as [start, end] pairs with inclusive indexes
columns = []

# Find the dashes row
for row in rows:
    # If it's all spaces/dashes with at least one dash
    if re.match(r"^[\s-]+$", row) and "-" in row:
        # Every unbroken run of dashes marks the extent of one column
        columns = [[run.start(), run.end() - 1] for run in re.finditer(r"-+", row)]
        # Only the first dashes row is used
        break

# Get an array of headers (the first nonempty row is taken as the header row)
headers = SplitOnColumns(rows[0], columns)

as_json = []

# For all rows besides the first two
for row in rows[2:]:
    # One dict per row, mapping each header to that row's value in the column
    as_json.append(dict(zip(headers, SplitOnColumns(row, columns))))

# Parenthesised form prints the same on Python 2 and is valid on Python 3
print(as_json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment