Skip to content

Instantly share code, notes, and snippets.

@jamessdixon
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jamessdixon/572a4739f4633c64a6b4 to your computer and use it in GitHub Desktop.
Save jamessdixon/572a4739f4633c64a6b4 to your computer and use it in GitHub Desktop.
Scraping WCPSS School Assignment
#r "../packages/FSharp.Data.2.1.1/lib/net40/FSharp.Data.dll"
open System.Net
open System.Text
open FSharp.Data
/// HTML type provider built from the sample page HouseSearchSample01.html.
/// getServerGeneratedParameters uses it to parse the reply to the first POST.
type context = HtmlProvider<"../data/HouseSearchSample01.html">
/// HTML type provider built from the sample page HouseSearchSample02.html.
/// getSchoolData uses it to parse the reply to the second POST.
type context' = HtmlProvider<"../data/HouseSearchSample02.html">
/// Form values supplied by the caller for the first lookup request
/// (serialised by createPage1QueryString).
type SearchCriteria =
    { /// Sent as the StreetTemplateValue form field.
      /// NOTE(review): the sample run passes "STRATH" for "Strathorn", so this looks
      /// like a street-name prefix -- confirm against the site.
      streetTemplateValue : string
      /// Sent as the StreetName form field; the sample run builds it as
      /// street, suffix and city joined with '+'.
      streetName : string
      /// Sent as the StreetNumber form field.
      streetNumber : string }
/// Form values for the second request: the caller's original criteria plus the two
/// values read back from the server's first reply (serialised by createPage2QueryString).
type SearchCriteria' =
    { /// Value of the CatchmentCode input returned by the server.
      catchmentCode : string
      /// Sent as the StreetName form field.
      streetName : string
      /// Sent as the StreetTemplateValue form field.
      streetTemplateValue : string
      /// Sent as the StreetNumber form field.
      streetNumber : string
      /// Value of the StreetZipCode input returned by the server.
      streetZipCode : string }
/// Address-lookup endpoint; both requests made by this script are POSTed to it.
let uri = "http://wwwgis2.wcpss.net/addressLookup/index.php"
/// Builds the street-name form value by joining street, suffix and city with '+'.
/// For example ("Strathorn", "Dr", "Cary") yields "Strathorn+Dr+Cary".
let composeStreetNameFromIndividualComponents(street:string, suffix:string, city:string) =
    [ street; suffix; city ] |> String.concat "+"
/// Serialises the search criteria into the form body for the first POST.
/// The values are inserted verbatim (no URL encoding is applied here), followed by the
/// fixed SubmitAddressSelectPage / DefaultAction fields.
let createPage1QueryString(searchCriteria:SearchCriteria)=
    [ "StreetTemplateValue="
      searchCriteria.streetTemplateValue
      "&StreetName="
      searchCriteria.streetName
      "&StreetNumber="
      searchCriteria.streetNumber
      "&SubmitAddressSelectPage=CONTINUE"
      "&DefaultAction=SubmitAddressSelectPage" ]
    |> String.concat ""
/// POSTs the page-1 form body to the lookup endpoint and reads the values of the
/// INPUT elements named "CatchmentCode" and "StreetZipCode" from the reply.
/// Returns Some (catchmentCode, streetZipCode), with spaces in the catchment code
/// replaced by '+' so it can be placed straight into the next form body.
/// Returns None when the request fails or either input is missing.
let getServerGeneratedParameters(queryString:string)=
    try
        use webClient = new WebClient()
        webClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded")
        let result = webClient.UploadString(uri,"POST",queryString)
        let body = context.Parse(result).Html.Body()
        // Collect the (name, value) pairs once. INPUT elements that lack either attribute
        // are skipped, so an unrelated input cannot abort the whole lookup.
        let inputs =
            body.Descendants("INPUT")
            |> Seq.choose(fun i ->
                match i.TryGetAttribute("name"), i.TryGetAttribute("value") with
                | Some name, Some value -> Some (name.Value(), value.Value())
                | _ -> None)
            |> Seq.toList
        // First value whose input name matches, if any.
        let tryGetValueFromInput(nameToFind:string) =
            inputs |> List.tryPick(fun (n,v) -> if n = nameToFind then Some v else None)
        match tryGetValueFromInput("CatchmentCode"), tryGetValueFromInput("StreetZipCode") with
        | Some catchmentCode, Some streetZipCode ->
            Some (catchmentCode.Replace(" ","+"), streetZipCode)
        | _ -> None
    with
    // Deliberately best-effort: any failure while fetching or parsing the page is
    // reported to the caller as None, exactly as before.
    | _ -> None
/// Runs the first request for the given criteria and, when the server values come back,
/// combines them with the original criteria into the record needed for the second request.
/// Returns None when getServerGeneratedParameters returns None.
let createSearchCriteria' (searchCriteria:SearchCriteria) =
    searchCriteria
    |> createPage1QueryString
    |> getServerGeneratedParameters
    |> Option.map (fun (serverCatchmentCode, serverZipCode) ->
        { catchmentCode = serverCatchmentCode
          streetName = searchCriteria.streetName
          streetTemplateValue = searchCriteria.streetTemplateValue
          streetNumber = searchCriteria.streetNumber
          streetZipCode = serverZipCode })
/// Serialises the second-stage criteria into the form body for the second POST.
/// Returns None when no criteria are supplied; values are inserted verbatim.
let createPage2QueryString(searchCriteria:option<SearchCriteria'>)=
    let toFormBody (criteria:SearchCriteria') =
        // NOTE(review): DefaultAction is sent twice (2014, then 2015), as the script has
        // always done -- confirm which of the two the server actually honours.
        [ "SelectAssignment%7C2014%7CCURRENT=2014-15"
          "&DefaultAction=SelectAssignment%7C2014%7CCURRENT"
          "&DefaultAction=SelectAssignment%7C2015%7CCURRENT"
          "&CatchmentCode="
          criteria.catchmentCode
          "&StreetName="
          criteria.streetName
          "&StreetTemplateValue="
          criteria.streetTemplateValue
          "&StreetNumber="
          criteria.streetNumber
          "&StreetZipCode="
          criteria.streetZipCode ]
        |> String.concat ""
    searchCriteria |> Option.map toFormBody
/// POSTs the page-2 form body to the lookup endpoint and extracts the text of the cells
/// in the first TABLE of the reply.
/// Each cell is trimmed, cut at the first '(' if there is one, and dropped when it ends
/// up empty or contains one of the boilerplate phrases listed below.
/// Returns None when no query string is supplied, when the request fails with a
/// WebException, or when the reply contains no TABLE.
let getSchoolData(queryString:option<string>) =
    // Keeps only the text in front of the first '(' (if any).
    let removeNonEssentialData (s:string) =
        match s.IndexOf('(') with
        | -1 -> s
        | markerPosition -> s.Substring(0,markerPosition).Trim()
    let unimportantPhrases =
        [| "Neighborhood Busing"
           "This school has an enrollment cap"
           "2015 BASE ATTENDANCE AREA"
           "2014 BASE ATTENDANCE AREA" |]
    let containsUnimportantPhrase (s:string) =
        unimportantPhrases |> Array.exists(fun p -> s.Contains(p))
    match queryString with
    | None -> None
    | Some query ->
        try
            use webClient = new WebClient()
            webClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded")
            let result = webClient.UploadString(uri,"POST",query)
            let body = context'.Parse(result).Html.Body()
            match body.Descendants("TABLE") |> Seq.toList with
            | [] -> None
            | schoolTable :: _ ->
                let schoolData =
                    schoolTable.Descendants("TR")
                    |> Seq.collect(fun r -> r.Descendants("TD"))
                    |> Seq.map(fun d -> d.InnerText().Trim())
                    |> Seq.map removeNonEssentialData
                    // Filter empties after the cut so a cell that was only "(...)"
                    // does not survive as an empty entry.
                    |> Seq.filter(fun s -> s <> System.String.Empty)
                    |> Seq.filter(fun s -> not (containsUnimportantPhrase s))
                    |> Seq.toList
                Some (Seq.ofList schoolData)
        with
        // Same convention as getServerGeneratedParameters: a failed request yields None.
        | :? WebException -> None
// Sample run: look up the assignment data for 904 Strathorn Dr, Cary.
let streetName = composeStreetNameFromIndividualComponents("Strathorn","Dr","Cary")
let searchCriteria =
    { SearchCriteria.streetTemplateValue = "STRATH"
      streetName = streetName
      streetNumber = "904" }
let result =
    searchCriteria
    |> createSearchCriteria'
    |> createPage2QueryString
    |> getSchoolData
// Unwraps the scraped values; this fails if any stage of the pipeline produced None.
result.Value
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment