Created
December 16, 2014 16:29
-
-
Save anonymous/68a03c7c388119de278c to your computer and use it in GitHub Desktop.
small HTML table scraper and demo in F#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.Net | |
open System.Text | |
// Regex approach adapted from:
// https://social.msdn.microsoft.com/Forums/en-US/5a26bb89-c0e4-4ca4-b0c7-220c5fe1f495/how-to-get-a-html-table-using-regex?forum=regexp
(*
Original per-tag patterns, kept for reference:
let table_pattern = "<table.*?>(.*?)</table>"
let tr_pattern = "<tr.*?>(.*?)</tr>"
let td_pattern = "<td.*?>(.*?)</td>"
*)
/// Returns the inner markup of every `<tag ...>...</tag>` element found in `html`,
/// in document order.
///
/// `tag` is the element name (e.g. "td"); it is matched literally and
/// case-insensitively. Line breaks are removed from `html` before matching,
/// so the returned strings never contain '\n' or '\r'.
/// Matching is non-greedy: each match ends at the first closing tag, so
/// elements nested inside an element of the same name are not handled.
let tagData (tag:string) (html:string): string list =
    // Regex `.` does not match '\n' by default, so collapse the markup onto one line first.
    let flattened = html.Replace("\n", "").Trim().Replace("\r", "")
    // Escape the name so regex metacharacters in it are taken literally, and
    // require a word boundary after it so that "tr" does not also open on "<track>".
    let name = RegularExpressions.Regex.Escape(tag)
    let pattern = String.Format(@"<{0}\b.*?>(.*?)</{0}>", name)
    [ for m in RegularExpressions.Regex.Matches(flattened,
                                                pattern,
                                                RegularExpressions.RegexOptions.IgnoreCase)
        -> m.Groups.Item(1).Value ]
/// Extracts the inner markup of each `<table>` element in `html`.
let tables(html:string): string list =
    html |> tagData "table"
/// Extracts the inner markup of each `<tr>` element in `html`.
let rows(html:string):string list =
    html |> tagData "tr"
/// Extracts the inner markup of each `<td>` element in `html`.
let cells(html:string): string list =
    html |> tagData "td"
/// Removes every `<...>` span (a '<' up to the next '>') from `html`
/// and returns the text that remains.
let stripHtml(html:string): string =
    let anyTag = RegularExpressions.Regex("<[^>]*>")
    anyTag.Replace(html, "")
/// Prints one summary line with the minimum, average and maximum of
/// `latencies` for `location`, followed by a verdict line: a warning when
/// the maximum exceeds `threshhold`, "All OK" otherwise.
let output (location:string) (latencies:float list) (threshhold:float): unit =
    // Computed in min, average, max order, before anything is printed.
    let lowest = List.min latencies
    let mean = List.average latencies
    let highest = List.max latencies
    printfn "%s min/avg/max = %f/%f/%f" location lowest mean highest
    if highest > threshhold then
        printfn "Looks like a bad day on the net"
    else
        printfn "All OK"
/// Program entry point: downloads the Verizon latency page, scrapes its
/// second HTML table and prints a min/avg/max summary plus a verdict for
/// each data row via `output`. Always returns exit code 0 when it completes.
///
/// Throws if the download fails, if the page has fewer tables/rows than
/// expected, or if a cell that should be numeric cannot be parsed.
[<EntryPoint>]
let main args =
    // `use` disposes the WebClient when main returns.
    use wc = new WebClient()
    let html = wc.DownloadString("http://www.verizonenterprise.com/about/network/latency/")
    tables html
    // table -> rows -> cells, with markup stripped from every cell
    |> List.map (fun table ->
        rows table
        |> List.map (fun row -> cells row |> List.map stripHtml))
    // Take the second table on the page.
    |> List.tail
    |> List.head
    // Drop the first three rows (presumably headers; confirm against the page layout).
    |> Seq.skip 2
    |> List.ofSeq
    |> List.tail
    // The first cell is the location label; the remaining cells are parsed as floats.
    |> List.map (fun row -> (List.head row, row |> List.tail |> List.map float))
    // The threshold is the first decimal number embedded in the location label.
    // The dot is escaped so it only matches a literal decimal point.
    |> List.map (fun (loc, lat) ->
        let threshold =
            RegularExpressions.Regex.Match(loc, @"(\d+\.\d+)").Groups.Item(1).Value |> float
        (loc, lat, threshold))
    |> List.iter (fun (area, lat, thresh) -> output area lat thresh)
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment