Skip to content

Instantly share code, notes, and snippets.

-- Gist preview fragment (SQL, truncated): CTE "rs" that appears to aggregate
-- baseball play-by-play events per game / inning / half-inning
-- (game_pk + event_inning + top_inning_sw look like the grouping keys).
-- NOTE(review): the preview cuts off mid-SELECT -- no FROM clause, GROUP BY,
-- or closing parenthesis is visible. Do not run as-is.
with
rs as (
select
game_pk
, event_inning
, top_inning_sw
-- Running max of the batting team's score = score at the end of the half-inning.
, max(bat_score) as bat_score_end_of_inning
-- Total outs recorded across the half-inning's events.
, sum(event_outs) as event_outs
-- Gist preview fragment (SQL, truncated after the CTE): locate the position
-- of the 6th comma inside comma-delimited 'play' records (presumably
-- Retrosheet-style event lines -- verify against the data source).
-- NOTE(review): instr() with 4 arguments (string, substring, start,
-- occurrence) is Oracle-style SQL; SQLite's instr() takes only 2 arguments.
-- The main query that consumes this CTE is not visible in the preview.
WITH
rs as (
select
-- Character position of the 6th comma in the record.
instr(play_string,',',1,6) as comma_position
, play_string
from mytable
-- Keep only records whose first 4 characters are the literal 'play'.
where substr(play_string,1,4)='play'
)
@tangotiger
tangotiger / parseSchedule.py
Last active June 20, 2017 23:49
NHL Schedule: JSON to csv/html
import json
# Gist preview (truncated): convert the NHL season-schedule JSON feed into
# csv and html output files. Only the setup lines are visible here.
print("Parse start")
# Original feed this local copy was downloaded from:
# http://live.nhl.com/GameData/SeasonSchedule-20152016.json
sourcefile = "C:/Users/TOM/DataNHL/original/SeasonSchedule.json"
targetfile = "C:/Users/TOM/DataNHL/final/parsed_SeasonSchedule.csv"
htmltargetfile = "C:/Users/TOM/DataNHL/final/parsed_SeasonSchedule.htm"
# NOTE(review): the preview is cut mid-statement -- the backslash continues
# this assignment onto a line that is not shown.
header_row='NHL_GAME_ID' \
@tangotiger
tangotiger / snippet_csv_to_sqlite.py
Created February 21, 2016 16:11
snippet Loading CSV into SQLite
# this takes a standard csv file and then loads it into a database
# question: what if instead of a csv file, I had an equivalent <table><tr><td> formatted file?
# NOTE(review): discover/resource/odo below come from the third-party
# odo/blaze stack; their imports (and list_game_id / targetfile) are defined
# on lines not shown in this gist preview.
sourcefile = "skeleton.csv"
# Infer the datashape (column names/types) from the skeleton csv once,
# then reuse it for every per-game file.
dshape = discover(resource(sourcefile))
# NOTE(review): the scrape flattened all indentation -- the three lines below
# are presumably the loop body / with-body in the original source.
for game_id in list_game_id:
sourcefile = "formatted_{g}.csv".format(g=game_id)
with open(sourcefile,'r') as infile:
odo(sourcefile, targetfile, dshape=dshape)
@tangotiger
tangotiger / parsePlaybyplay.py
Last active February 7, 2016 21:28
Parse the Play by Play file
from bs4 import BeautifulSoup
import re
# --- run configuration for the play-by-play parser (gist preview) ---
print("Parse start")

# Range of NHL game ids to process. The "+665" / "-2" notes in the original
# comments look like leftovers from debugging a narrower range.
first_game_id = 1
last_game_id = 668

# Season identifiers used when building source-file names
# ('02' is the NHL regular-season sub-season code -- TODO confirm).
season_id = '20152016'
subseason_id = '02'
@tangotiger
tangotiger / stripPlaybyplay.py
Last active February 7, 2016 21:15
Strip the Play by Play file so we are only left with the play records. Players On Ice not handled.
# takes 2 seconds to run per infile
# source for infile: http://www.nhl.com/scores/htmlreports/20152016/PL020666.HTM
from bs4 import BeautifulSoup
# --- run configuration for the play-by-play stripper (gist preview) ---
print("Parse start")

# Game-id range to strip; the "+665" / "-2" offsets in the original comments
# appear to be debugging leftovers.
first_game_id = 1
last_game_id = 668
@tangotiger
tangotiger / parseRoster.py
Last active February 7, 2016 21:19
Convert HTML file into csv
from bs4 import BeautifulSoup
# --- run configuration for the roster HTML-to-csv converter (gist preview) ---
print("Parse start")

# Which games to convert; "+665" / "-2" in the original comments look like
# remnants of a narrower debugging range.
first_game_id = 1
last_game_id = 668

# Pieces of the NHL report filename: season, sub-season code,
# and 'RO' for the RosterReport file type.
season_id = '20152016'
subseason_id = '02'
datafile_id = 'RO'
@tangotiger
tangotiger / stripRoster.py
Last active February 7, 2016 21:22
Strip the Roster HTML file so we are only left with the TABLE rows we need
# --- run configuration for the roster stripper (gist preview) ---
print("Parse start")

# Game-id range to strip ("+665" / "-2" in the original comments were
# apparently debugging leftovers).
first_game_id = 1
last_game_id = 668

# Filename components: sub-season code and 'RO' roster-report type.
subseason_id = '02'
datafile_id = 'RO'

# HTML fragments that mark the TABLE rows worth keeping.
searchstr1 = '#</td>'
searchstr2 = 'Pos</td>'
@tangotiger
tangotiger / parseTable.py
Last active January 24, 2016 13:19
Navigating Table with BS4
import urllib.request
from bs4 import BeautifulSoup
# Demo (gist preview, truncated): load a locally saved partial NHL roster
# report and hand it to BeautifulSoup for table navigation; the code that
# actually walks the table is not shown here.
print("Parse start")
# file:// URL pointing at a saved copy of roster report RO020666.
sourcefile = "file:///C:/Users/TOM/PycharmProjects/downloadNHL/datafiles/RO020666_partial.HTM"
html = urllib.request.urlopen(sourcefile)  # file-like response object
soup = BeautifulSoup(html, "lxml")  # requires the third-party lxml parser
@tangotiger
tangotiger / parseSchedule.py
Last active February 7, 2016 21:24
Parse Schedule
# --- configuration for extracting game ids from the saved schedule page ---
# Each NHL game id on the page follows the "recap?id=" marker and is exactly
# as long as the sample id below (e.g. 2015020001 = season+subseason+game).
print("Parse start")

sourcefile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\schedulebyseason.htm"
targetfile = "C:\\Users\\TOM\\PycharmProjects\\downloadNHL\\datafiles\\parsed_schedulebyseason.txt"

# Derive the id length from a representative record rather than hard-coding it.
sample_recstr = "2015020001"
reclen = len(sample_recstr)
searchstr = "recap?id="