Last active
August 29, 2015 14:15
-
-
Save jamessdixon/572a4739f4633c64a6b4 to your computer and use it in GitHub Desktop.
Scraping WCPSS School Assignment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "../packages/FSharp.Data.2.1.1/lib/net40/FSharp.Data.dll" | |
open System.Net | |
open System.Text | |
open FSharp.Data | |
type context = HtmlProvider<"../data/HouseSearchSample01.html"> | |
type context' = HtmlProvider<"../data/HouseSearchSample02.html"> | |
type SearchCriteria = {streetTemplateValue:string; | |
streetName:string; | |
streetNumber:string;} | |
type SearchCriteria' = {catchmentCode:string; | |
streetName:string; | |
streetTemplateValue:string; | |
streetNumber:string; | |
streetZipCode:string} | |
let uri = "http://wwwgis2.wcpss.net/addressLookup/index.php" | |
let composeStreetNameFromIndividualComponents(street:string, suffix:string, city:string) = | |
street + "+" + suffix + "+" + city | |
let createPage1QueryString(searchCriteria:SearchCriteria)= | |
let stringBuilder = new StringBuilder() | |
stringBuilder.Append("StreetTemplateValue=") |> ignore | |
stringBuilder.Append(searchCriteria.streetTemplateValue) |> ignore | |
stringBuilder.Append("&StreetName=") |> ignore | |
stringBuilder.Append(searchCriteria.streetName) |> ignore | |
stringBuilder.Append("&StreetNumber=") |> ignore | |
stringBuilder.Append(searchCriteria.streetNumber) |> ignore | |
stringBuilder.Append("&SubmitAddressSelectPage=CONTINUE") |> ignore | |
stringBuilder.Append("&DefaultAction=SubmitAddressSelectPage") |> ignore | |
stringBuilder.ToString() | |
let getServerGeneratedParameters(queryString:string)= | |
try | |
use webClient = new WebClient() | |
webClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded") | |
let result = webClient.UploadString(uri,"POST",queryString) | |
let body = context.Parse(result).Html.Body() | |
let inputs = body.Descendants("INPUT") |> Seq.toList | |
let inputs' = inputs |> Seq.map(fun i -> i.Attribute("name").Value(),i.Attribute("value").Value()) | |
let getValueFromInput(nameToFind:string) = | |
inputs' |> Seq.filter(fun (n,v) -> n = nameToFind) | |
|> Seq.map(fun (n,v) -> v) | |
|> Seq.head | |
let catchmentCode = getValueFromInput("CatchmentCode") | |
let catchmentCode' = catchmentCode.Replace(" ","+") | |
let streetZipCode = getValueFromInput("StreetZipCode") | |
let result' = catchmentCode', streetZipCode | |
Some result' | |
with | |
| :? System.ArgumentException -> None | |
| :? System.Exception -> None | |
let createSearchCriteria' (searchCriteria:SearchCriteria) = | |
let page1QueryString = createPage1QueryString(searchCriteria) | |
let serverParameters = getServerGeneratedParameters(page1QueryString) | |
match serverParameters.IsSome with | |
| true -> | |
Some {catchmentCode=fst serverParameters.Value; | |
streetName=searchCriteria.streetName; | |
streetTemplateValue=searchCriteria.streetTemplateValue; | |
streetNumber=searchCriteria.streetNumber; | |
streetZipCode=snd serverParameters.Value} | |
| false -> None | |
let createPage2QueryString(searchCriteria:option<SearchCriteria'>)= | |
match searchCriteria.IsSome with | |
| true -> | |
let stringBuilder = new StringBuilder() | |
stringBuilder.Append("SelectAssignment%7C2014%7CCURRENT=2014-15") |> ignore | |
stringBuilder.Append("&DefaultAction=SelectAssignment%7C2014%7CCURRENT") |> ignore | |
stringBuilder.Append("&DefaultAction=SelectAssignment%7C2015%7CCURRENT") |> ignore | |
stringBuilder.Append("&CatchmentCode=") |> ignore | |
stringBuilder.Append(searchCriteria.Value.catchmentCode) |> ignore | |
stringBuilder.Append("&StreetName=") |> ignore | |
stringBuilder.Append(searchCriteria.Value.streetName) |> ignore | |
stringBuilder.Append("&StreetTemplateValue=") |> ignore | |
stringBuilder.Append(searchCriteria.Value.streetTemplateValue) |> ignore | |
stringBuilder.Append("&StreetNumber=") |> ignore | |
stringBuilder.Append(searchCriteria.Value.streetNumber) |> ignore | |
stringBuilder.Append("&StreetZipCode=") |> ignore | |
stringBuilder.Append(searchCriteria.Value.streetZipCode) |> ignore | |
let result = stringBuilder.ToString() | |
Some result | |
| false -> None | |
let getSchoolData(queryString:option<string>) = | |
match queryString.IsSome with | |
| true -> | |
use webClient = new WebClient() | |
webClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded") | |
let result = webClient.UploadString(uri,"POST",queryString.Value) | |
let body = context'.Parse(result).Html.Body() | |
let tables = body.Descendants("TABLE") |> Seq.toList | |
let schoolTable = tables.[0] | |
let schoolRows = schoolTable.Descendants("TR") |> Seq.toList | |
let schoolData = schoolRows |> Seq.collect(fun r -> r.Descendants("TD")) |>Seq.toList | |
let schoolData' = schoolData |> Seq.map(fun d -> d.InnerText().Trim()) | |
let schoolData'' = schoolData' |> Seq.filter(fun s -> s <> System.String.Empty) | |
let removeNonEssentialData (s:string) = | |
let markerPosition = s.IndexOf('(') | |
match markerPosition with | |
| -1 -> s | |
| _ -> s.Substring(0,markerPosition).Trim() | |
let schoolData''' = schoolData'' |> Seq.map(fun s -> removeNonEssentialData(s)) | |
let unimportantPhrases = [|"Neighborhood Busing"; | |
"This school has an enrollment cap"; | |
"2015 BASE ATTENDANCE AREA"; | |
"2014 BASE ATTENDANCE AREA"|] | |
let containsUnimportantPhrase (s:string) = | |
unimportantPhrases |> Seq.exists(fun p -> s.Contains(p)) | |
let schoolData'''' = schoolData''' |> Seq.filter(fun s -> containsUnimportantPhrase(s) = false ) | |
Some schoolData'''' | |
| false -> None | |
let streetName = composeStreetNameFromIndividualComponents("Strathorn","Dr","Cary") | |
let searchCriteria = {SearchCriteria.streetTemplateValue="STRATH";streetName=streetName;streetNumber="904"} | |
let result = createSearchCriteria'(searchCriteria) | |
|> createPage2QueryString | |
|> getSchoolData | |
result.Value |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment