// Alternative implementation of Screen Scraping College Football Statistics
#r @"packages\FSharp.Data.2.0.0-alpha2\lib\net40\FSharp.Data.dll"
#r "System.Xml.Linq"
open System.Xml.Linq
open FSharp.Data
open FSharp.Data.Json
open FSharp.Data.Json.Extensions
open FSharp.Net
// Without Type Providers
/// Downloads the recruit-rankings page for `year` and returns, as a raw
/// string, the text found between the "var rankingsTableData =" header and
/// the next ';' in the page.
/// Fails with an exception if either marker cannot be found.
let getRecrutRankingsJsonString (year:int) =
    // NOTE(review): the site prefix of this URL is an empty string in this
    // copy of the script; the request cannot succeed until it is restored.
    let url = "" + year.ToString() + "/BIG10/all"
    // Using Http.RequestString from FSharp.Data instead of WebRequest directly
    let htmlString = Http.RequestString url
    // Store header in a separate value so we can use .Length, instead of hardcoding 23
    let header = "var rankingsTableData ="
    let startPosition = htmlString.IndexOf(header)
    if startPosition < 0 then
        failwithf "Could not find '%s' in the rankings page for %d" header year
    let headerLength = header.Length
    let endPosition = htmlString.IndexOf(";", startPosition)
    if endPosition < 0 then
        failwithf "Could not find the ';' that ends '%s' in the rankings page for %d" header year
    // Result: everything after the header up to (not including) the ';'.
    htmlString.Substring(startPosition + headerLength, endPosition - startPosition - headerLength)
/// Returns one (name, rank) pair per element of the recruit-rankings JSON
/// array for `year`, in the order the elements appear in the array.
let getRecrutRankings year =
    let data = getRecrutRankingsJsonString year
    // Use JsonValue from FSharp.Data
    let results = JsonValue.Parse data
    // Map over the elements: the `?` property lookup applies to a single
    // JsonValue, not to the array returned by AsArray().
    results.AsArray()
    |> Array.map (fun x -> x?name.AsString(), x?rank.AsInteger())
/// Downloads the conference page for `year` and returns the markup of the
/// first "<table" that follows the "my-teams-table" marker, up to and
/// including the eight characters starting at the next "</table".
let getConferenceStandingHtmlString (year:int) =
    // NOTE(review): the site prefix of this address is an empty string here.
    let address = "" + year.ToString() + "/big-ten-conference"
    let page = Http.RequestString address
    // Locate the marker first, then the table that comes after it.
    let markerIndex = page.IndexOf("my-teams-table")
    let openIndex = page.IndexOf("<table", markerIndex)
    let closeIndex = page.IndexOf("</table", openIndex)
    // +8 is the length of "</table>", so the closing tag is part of the result.
    let length = closeIndex - openIndex + 8
    page.Substring(openIndex, length)
/// Finds the first <td> whose text equals `school` in the standings table
/// for `year` and returns its text paired with the text of the sibling
/// element that immediately follows it.
/// Raises if no such cell exists or if the cell has no following sibling.
let getConferenceStanding year school =
    // Use XDocument from .NET 3.5 instead of old XmlDocument
    let table = XDocument.Parse(getConferenceStandingHtmlString year)
    let schoolCell =
        table.Descendants(XName.Get "td")
        |> Seq.find (fun cell -> cell.Value = school)
    let nextCell = Seq.head (schoolCell.ElementsAfterSelf())
    schoolCell.Value, nextCell.Value
/// Runs getConferenceStanding for each listed school for `year` and returns
/// the resulting (school, value) pairs as a list in descending order of the
/// second component.
let getConferenceStandings year =
    let schools =
        [|"Nebraska";"Michigan";"Northwestern";"Michigan State";"Iowa";
          "Minnesota";"Ohio State";"Penn State";"Wisconsin"; "Purdue"; "Indiana"; "Illinois"|]
    schools
    |> Seq.map (getConferenceStanding year) // Note the use of partial application
    // NOTE(review): the second component is a string, so this ordering is
    // lexicographic rather than numeric - confirm that is intended.
    |> Seq.sortBy snd
    |> Seq.toList
    // Ascending sort followed by a reversal gives descending order.
    |> List.rev
// With Type Providers
// Step 1: get sample data into the disk
open System.IO
// Fetch the 2013 recruit rankings and save them as "recrutRankings.json"
// in the directory that contains this script.
let recrutRankingsSampleJson = getRecrutRankingsJsonString 2013
File.WriteAllText (Path.Combine(__SOURCE_DIRECTORY__, "recrutRankings.json"), recrutRankingsSampleJson)
// Do the same for the 2013 conference standings table markup,
// saved as "conferenceStanding.html".
let conferenceStandingSampleHtml = getConferenceStandingHtmlString 2013
File.WriteAllText (Path.Combine(__SOURCE_DIRECTORY__, "conferenceStanding.html"), conferenceStandingSampleHtml)
// Step 2: use the sample in the type providers
// Both providers are pointed at the file names written in step 1.
// NOTE(review): presumably the two sample files must already exist on disk
// when these declarations are type-checked - confirm before running the
// script top to bottom in a fresh checkout.
type RecrutRankingsType = JsonProvider<"recrutRankings.json">
type ConferenceStandingType = XmlProvider<"conferenceStanding.html">
/// Type-provider variant of getRecrutRankings: parses the rankings for
/// `year` with RecrutRankingsType and returns one (Name, Rank) pair per entry.
let getRecrutRankingsWithTP year =
    let data = getRecrutRankingsJsonString year
    let results = RecrutRankingsType.Parse data
    // you could also use the type provider inline instead of declaring it above:
    // let results = JsonProvider<"recrutRankings.json">.Parse data
    // Map over the entries: Name and Rank belong to a single entry, not to
    // the collection. NOTE(review): assumes Parse yields an array because
    // the sample's root is a JSON array - confirm against the provided type.
    results |> Array.map (fun x -> x.Name, x.Rank)
// XmlProvider doesn't work so well with HTML, but let's try anyway.
// I suggest using Html Agility Pack instead.
// Type-provider variant of the standing lookup: parses the standings table
// markup for `year` with ConferenceStandingType and defines two alternative
// ways (option1, option2) of pairing the cell whose text equals `school`
// with the cell that follows it.
// NOTE(review): this definition looks incomplete as it stands - `cells` has
// no expression feeding the Array.collect pipeline, and the body ends on the
// `option2` definition without a result expression (presumably a call to
// option1() or option2()). Confirm against the original script.
let getConferenceStandingWithTP year school =
let data = getConferenceStandingHtmlString year
let results = ConferenceStandingType.Parse data
// Intended to be the flat array of all <td> cells, gathered row by row.
// NOTE(review): the collection of rows that should be piped in is missing.
let cells =
|> Array.collect (fun tr -> tr.GetTds())
// XmlProvider only has downwards navigation, so we have to workaround it by indexing into the parent collection:
let option1() =
let keyCellIndex = cells |> Array.findIndex (fun td -> td.StringValue = Some school)
let keyCell = cells.[keyCellIndex]
// The value is taken from the next cell in the flattened array.
let valueCell = cells.[keyCellIndex+1]
keyCell.StringValue.Value, valueCell.StringValue.Value
// or by falling back to XElement like this:
let option2() =
let keyCell = cells |> Array.find (fun td -> td.StringValue = Some school)
// Use the underlying XElement to reach the sibling element that follows.
let valueCell = keyCell.XElement.ElementsAfterSelf() |> Seq.head
keyCell.StringValue.Value, valueCell.Value
/// Runs getConferenceStandingWithTP for each listed school for `year` and
/// returns the resulting pairs as a list in descending order of the second
/// component.
let getConferenceStandingsWithTP year =
    let schools =
        [|"Nebraska";"Michigan";"Northwestern";"Michigan State";"Iowa";
          "Minnesota";"Ohio State";"Penn State";"Wisconsin"; "Purdue"; "Indiana"; "Illinois"|]
    schools
    |> Seq.map (getConferenceStandingWithTP year) // Note the use of partial application
    |> Seq.sortBy snd
    |> Seq.toList
    // Ascending sort followed by a reversal gives descending order.
    |> List.rev
