Skip to content

Instantly share code, notes, and snippets.

Created December 16, 2014 16:29
Show Gist options
  • Save anonymous/68a03c7c388119de278c to your computer and use it in GitHub Desktop.
Save anonymous/68a03c7c388119de278c to your computer and use it in GitHub Desktop.
small HTML table scraper and demo in F#
open System
open System.Net
open System.Text
// https://social.msdn.microsoft.com/Forums/en-US/5a26bb89-c0e4-4ca4-b0c7-220c5fe1f495/how-to-get-a-html-table-using-regex?forum=regexp
(*
let table_pattern = "<table.*?>(.*?)</table>"
let tr_pattern = "<tr.*?>(.*?)</tr>"
let td_pattern = "<td.*?>(.*?)</td>"
*)
/// Returns the inner content of every `<tag ...>...</tag>` element found in `html`,
/// in document order.
///
/// `tag` is inserted into the regular expression verbatim (it is not escaped).
/// Matching is case-insensitive and non-greedy, so each match ends at the first
/// closing tag that follows its opening tag; nested elements of the same name
/// are not balanced.
let tagData (tag:string) (html:string): string list =
    // Line breaks are removed before matching: `.` does not match '\n' unless
    // RegexOptions.Singleline is set, and this lets an element span several lines.
    let flattened = html.Replace("\n", "").Replace("\r", "")
    // `\b` after the tag name keeps a short name from opening on a longer tag
    // (e.g. "th" must not match "<thead>").
    let pattern = String.Format(@"<{0}\b.*?>(.*?)</{0}>", tag)
    RegularExpressions.Regex.Matches(flattened, pattern, RegularExpressions.RegexOptions.IgnoreCase)
    |> Seq.cast<RegularExpressions.Match>
    |> Seq.map (fun m -> m.Groups.[1].Value)
    |> List.ofSeq
/// Returns the inner content of each `<table>` element in `html`.
let tables(html:string): string list =
    html |> tagData "table"
/// Returns the inner content of each `<tr>` element in `html`.
let rows(html:string):string list =
    html |> tagData "tr"
/// Returns the inner content of each `<td>` element in `html`.
let cells(html:string): string list =
    html |> tagData "td"
/// Removes every `<...>` tag from `html`, leaving the text between tags untouched.
let stripHtml(html:string): string =
    let anyTag = RegularExpressions.Regex("<[^>]*>")
    anyTag.Replace(html, "")
/// Prints the minimum, average and maximum of `latencies` for `location`, then a
/// verdict line: a warning when the maximum exceeds `threshhold`, "All OK" otherwise.
let output (location:string) (latencies:float list) (threshhold:float): unit =
    let lowest = List.min latencies
    let mean = List.average latencies
    let highest = List.max latencies
    printfn "%s min/avg/max = %f/%f/%f" location lowest mean highest
    if highest > threshhold then
        printfn "Looks like a bad day on the net"
    else
        printfn "All OK"
/// Entry point: downloads the latency page, parses its tables into
/// tables -> rows -> cells of plain text, and prints a min/avg/max report per row
/// of the second table. The alert threshold for each row is the decimal number
/// embedded in that row's first cell. Always returns 0.
[<EntryPoint>]
let main args =
    // `use` disposes the WebClient when main exits.
    use wc = new WebClient()
    let html = wc.DownloadString("http://www.verizonenterprise.com/about/network/latency/")
    // Every table becomes a list of rows; every row a list of tag-free cell strings.
    let parsedTables =
        tables html
        |> List.map (fun table ->
            rows table
            |> List.map (fun row -> cells row |> List.map stripHtml))
    // Extracts the decimal number in a row label. The dot is escaped so that only
    // a real decimal point is accepted, and a missing number is reported clearly
    // instead of failing inside `float ""`.
    let thresholdOf (label:string) : float =
        let m = RegularExpressions.Regex.Match(label, @"(\d+\.\d+)")
        if m.Success then
            float m.Groups.[1].Value
        else
            raise (FormatException(sprintf "No latency threshold found in row label '%s'" label))
    parsedTables
    |> List.tail
    |> List.head        // the second table on the page
    |> Seq.skip 2
    |> List.ofSeq
    |> List.tail        // three leading rows dropped in total (presumably headers; verify against the page)
    |> List.map (fun row -> (List.head row, row |> List.tail |> List.map float))
    |> List.map (fun (loc, lat) -> (loc, lat, thresholdOf loc))
    |> List.iter (fun (area, lat, thresh) -> output area lat thresh)
    0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment