Skip to content

Instantly share code, notes, and snippets.

@ovatsus
Last active January 1, 2016 09:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ovatsus/8123945 to your computer and use it in GitHub Desktop.
Save ovatsus/8123945 to your computer and use it in GitHub Desktop.
Alternative implementation of Screen Scraping College Football Statistics (http://jamessdixon.wordpress.com/2013/12/24/screen-scraping-college-football-statistics/)
#r @"packages\FSharp.Data.2.0.0-alpha2\lib\net40\FSharp.Data.dll"
#r "System.Xml.Linq"
open System.Xml.Linq
open FSharp.Data
open FSharp.Data.Json
open FSharp.Data.Json.Extensions
open FSharp.Net
// Without Type Providers
/// Downloads the Yahoo Big Ten recruiting team-rank page for `year` and returns
/// the text assigned to the `rankingsTableData` JavaScript variable, i.e. everything
/// between the `var rankingsTableData =` header and the next `;`, trimmed.
/// Fails with a descriptive message if either marker cannot be found in the page.
let getRecrutRankingsJsonString (year:int) =
    let url = "http://sports.yahoo.com/footballrecruiting/football/recruiting/teamrank/" + string year + "/BIG10/all"
    // Using Http.RequestString from FSharp.Data instead of WebRequest directly
    let htmlString = Http.RequestString url
    // Store header in a separate value so we can use .Length, instead of hardcoding 23
    let header = "var rankingsTableData ="
    // Ordinal search: the markers are code tokens, so culture-specific matching rules must not apply
    let startPosition = htmlString.IndexOf(header, System.StringComparison.Ordinal)
    if startPosition < 0 then
        failwithf "Could not find '%s' in the page downloaded from %s" header url
    let valueStart = startPosition + header.Length
    let endPosition = htmlString.IndexOf(";", valueStart, System.StringComparison.Ordinal)
    if endPosition < 0 then
        failwithf "Could not find the ';' terminating '%s' in the page downloaded from %s" header url
    htmlString.Substring(valueStart, endPosition - valueStart).Trim()
/// Parses the rankings data downloaded for `year` and returns an array of
/// (name, rank) pairs, one per element of the top-level JSON array.
let getRecrutRankings year =
    // Use JsonValue from FSharp.Data
    let toNameAndRank (team:JsonValue) =
        team?name.AsString(), team?rank.AsInteger()
    let parsed = JsonValue.Parse (getRecrutRankingsJsonString year)
    parsed.AsArray() |> Array.map toNameAndRank
/// Downloads the ESPN Big Ten conference standings page for `year` and returns the
/// markup of the first `<table ...>...</table>` element that follows the
/// `my-teams-table` marker, including the closing tag.
/// Fails with a descriptive message if any of the expected markers is missing.
let getConferenceStandingHtmlString (year:int) =
    let url = "http://espn.go.com/college-football/conferences/standings/_/id/5/year/" + string year + "/big-ten-conference"
    let htmlString = Http.RequestString url
    // Ordinal search from a given offset; a missing marker becomes a readable error
    // instead of an out-of-range exception from a later IndexOf/Substring call.
    let indexOfOrFail (token:string) (from:int) =
        match htmlString.IndexOf(token, from, System.StringComparison.Ordinal) with
        | -1 -> failwithf "Could not find '%s' in the page downloaded from %s" token url
        | position -> position
    // Keep the closing tag in a value so we can use .Length instead of hardcoding 8
    let closingTag = "</table>"
    let divMarkerStartPosition = indexOfOrFail "my-teams-table" 0
    let tableStartPosition = indexOfOrFail "<table" divMarkerStartPosition
    let tableEndPosition = indexOfOrFail closingTag tableStartPosition
    htmlString.Substring(tableStartPosition, tableEndPosition - tableStartPosition + closingTag.Length)
/// Looks up `school` in the standings table downloaded for `year`.
/// Returns the text of the first `td` whose value equals `school`, paired with the
/// text of the element immediately following that cell.
/// Throws if no such cell exists or if the cell has no following sibling.
let getConferenceStanding year school =
    let html = getConferenceStandingHtmlString year
    // Use XDocument from .NET 3.5 instead of old XmlDocument
    let table = XDocument.Parse html
    let isSchoolCell (cell:XElement) = cell.Value = school
    let schoolCell =
        table.Descendants(XName.Get "td")
        |> Seq.find isSchoolCell
    let standingCell = Seq.head (schoolCell.ElementsAfterSelf())
    schoolCell.Value, standingCell.Value
/// Collects the standing of each of the twelve listed schools for `year` and returns
/// them as a list: sorted ascending by the second tuple element, then reversed.
let getConferenceStandings year =
    let schools =
        [| "Nebraska"; "Michigan"; "Northwestern"; "Michigan State"; "Iowa"
           "Minnesota"; "Ohio State"; "Penn State"; "Wisconsin"; "Purdue"; "Indiana"; "Illinois" |]
    let ascending =
        schools
        |> Seq.map (fun school -> getConferenceStanding year school)
        |> Seq.sortBy (fun (_, standing) -> standing)
    // Reverse the ascending order so the largest standing value comes first
    ascending |> List.ofSeq |> List.rev
// With Type Providers
// Step 1: save sample data to disk, next to this script, so the type providers
// in step 2 have sample files to read
open System.IO
let recrutRankingsSampleJson = getRecrutRankingsJsonString 2013
do
    let samplePath = Path.Combine(__SOURCE_DIRECTORY__, "recrutRankings.json")
    File.WriteAllText(samplePath, recrutRankingsSampleJson)
let conferenceStandingSampleHtml = getConferenceStandingHtmlString 2013
do
    let samplePath = Path.Combine(__SOURCE_DIRECTORY__, "conferenceStanding.html")
    File.WriteAllText(samplePath, conferenceStandingSampleHtml)
// Step 2: use the sample in the type providers
// Both sample file names match the files written in step 1, so step 1 has to be
// run at least once before these declarations can be type-checked.
// NOTE(review): the sample paths are relative; confirm they resolve against the
// folder step 1 writes to (__SOURCE_DIRECTORY__) in your environment.

// Typed view over the recruiting rankings JSON, inferred from the saved sample.
type RecrutRankingsType = JsonProvider<"recrutRankings.json">
// Typed view over the standings table, treating the saved HTML fragment as XML.
type ConferenceStandingType = XmlProvider<"conferenceStanding.html">
/// Type-provider version of the rankings lookup: downloads the rankings data for
/// `year`, parses it with `RecrutRankingsType` and returns an array of
/// (Name, Rank) pairs.
let getRecrutRankingsWithTP year =
    let json = getRecrutRankingsJsonString year
    // you could also use the type provider inline instead of declaring it above:
    // let teams = JsonProvider<"recrutRankings.json">.Parse json
    let teams = RecrutRankingsType.Parse json
    [| for team in teams -> team.Name, team.Rank |]
// XmlProvider doesn't work so well with Html, but let's try anyway.
// I suggest using Html Agility Pack instead. See http://blog.codebeside.org/blog/2013/10/14/fsharp-for-screen-scraping/
/// Type-provider version of the standing lookup: finds the table cell whose text is
/// `school` in the standings downloaded for `year` and returns that text paired
/// with the text of the cell that follows it.
let getConferenceStandingWithTP year school =
    let html = getConferenceStandingHtmlString year
    let table = ConferenceStandingType.Parse html
    // Flatten every row's cells into a single array
    let cells =
        [| for row in table.GetTrs() do
             yield! row.GetTds() |]
    // XmlProvider only has downwards navigation, so we have to work around it by indexing into the parent collection:
    let option1() =
        let keyIndex = cells |> Array.findIndex (fun cell -> cell.StringValue = Some school)
        let keyText = cells.[keyIndex].StringValue.Value
        let valueText = cells.[keyIndex + 1].StringValue.Value
        keyText, valueText
    // or by falling back to XElement like this:
    let option2() =
        let keyCell = cells |> Array.find (fun cell -> cell.StringValue = Some school)
        let nextSibling = Seq.head (keyCell.XElement.ElementsAfterSelf())
        keyCell.StringValue.Value, nextSibling.Value
    // option2 is kept only as an illustration of the alternative; option1 is the one used
    option1()
/// Type-provider version of the standings list: collects the standing of each of the
/// twelve listed schools for `year` and returns them as a list, sorted ascending by
/// the second tuple element and then reversed.
let getConferenceStandingsWithTP year =
    let schools =
        [| "Nebraska"; "Michigan"; "Northwestern"; "Michigan State"; "Iowa"
           "Minnesota"; "Ohio State"; "Penn State"; "Wisconsin"; "Purdue"; "Indiana"; "Illinois" |]
    let ascending =
        schools
        |> Seq.map (fun school -> getConferenceStandingWithTP year school)
        |> Seq.sortBy (fun (_, standing) -> standing)
    // Reverse the ascending order so the largest standing value comes first
    ascending |> List.ofSeq |> List.rev
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment