Skip to content

Instantly share code, notes, and snippets.

Created October 5, 2023 17:42
Show Gist options
  • Save ImaginaryDevelopment/d0e4e51f94f02d9fa142ff7462d11f14 to your computer and use it in GitHub Desktop.
Save ImaginaryDevelopment/d0e4e51f94f02d9fa142ff7462d11f14 to your computer and use it in GitHub Desktop.
some sample code used to scrape data from one set based on another
// Walk two sets of data and remove the overlap.
/// Returns `x` converted to lower case using the invariant culture.
let toLower (x:string) : string =
    x.ToLowerInvariant ()
/// Returns the part of `value` that follows the last occurrence of `delimiter`.
/// When `delimiter` does not occur, `value` is returned unchanged.
let afterLast (delimiter:string) (value:string) =
    // Ordinal search: this is path/identifier splitting, not linguistic text.
    match value.LastIndexOf(delimiter, System.StringComparison.Ordinal) with
    | -1 -> value
    // Skip the whole delimiter, not just one character, so multi-character
    // delimiters do not leak into the result.
    | idx -> value.Substring(idx + delimiter.Length)
// Photo entries: every line after the first two of the photos file, with
// surrounding double quotes trimmed and lower-cased, capped at 1,000 entries.
// NOTE(review): the two skipped lines are presumably header rows - confirm.
let photos =
    File.ReadAllLines(@"C:\Users\B\Documents\lancephotos.csv")
    |> Array.skip 2
    // Normalise each line; the lambda must be mapped over the array rather
    // than applied to the array as a whole.
    |> Array.map (fun v -> v.Trim '"' |> toLower)
    |> Array.truncate 1_000
// Student entries: every line after the first two of the students file, with
// surrounding double quotes trimmed, lower-cased, and reduced to the text
// after the last backslash.
// NOTE(review): the two skipped lines are presumably header rows - confirm.
let students =
    File.ReadAllLines(@"C:\Users\B\Documents\lancestudents.csv")
    |> Array.skip 2
    // Normalise each line; the lambda must be mapped over the array rather
    // than applied to the array as a whole.
    |> Array.map (fun v -> v.Trim '"' |> toLower |> afterLast "\\") // |> Array.truncate 5
/// Converts a millisecond count to whole seconds (integer division, so any
/// fractional second is dropped).
let msToSeconds (ms: int64) =
    ms / 1000L
// The raw data had quote characters in it because the files were read as plain
// lines rather than parsed as CSV. Report whether any entry still contains one.
(photos |> Array.exists (fun photo -> photo.Contains "\""),
 students |> Array.exists (fun s -> s.Contains "\"")).Dump()
printfn "%i photos, %i students" (Array.length photos) (Array.length students)
// Total number of photo entries; used as the denominator for progress figures.
let maxI = Array.length photos
/// Formats `x` with the "N0" numeric format (group separators, no decimals)
/// using the current culture.
let commaChameleon (x:float) =
    x.ToString("N0", System.Globalization.CultureInfo.CurrentCulture)
/// Builds a one-line progress summary.
///   andMyFriends - total milliseconds elapsed so far
///   i            - number of items processed (used for the per-second rate)
///   v            - progress value shown and compared against `maxI`
let rateMe (andMyFriends:int64) (i:int) (v:int) =
    let seconds = msToSeconds andMyFriends
    // `seconds` is int64, so the literals must be int64 as well. The guard
    // avoids dividing by zero during the first second.
    let rate = if seconds > 0L then int64 i / seconds else 0L
    // Scale to 0..100 because the format string prints a '%' after the number.
    let percent = if maxI > 0 then 100.0 * float v / float maxI else 0.0
    sprintf "Finished %s(%.2f%%) %A per second in %i seconds" (commaChameleon (float v)) percent rate seconds
/// Filters `photos` with a strategy-specific predicate while reporting progress
/// and timing.
///   title      - label shown next to the progress text and returned in the result
///   fStudents  - turns the module-level `students` array into the lookup
///                structure the predicate expects
///   fPredicate - called with that structure and each photo; photos for which
///                it returns true are kept
/// Returns (title, elapsed milliseconds, number of photos kept).
let genericComparer (title:string) fStudents fPredicate =
    let timer = System.Diagnostics.Stopwatch.StartNew()
    // A ref cell is required: `let mutable` locals cannot be captured by the
    // closures below.
    let i = ref 0
    let dumpProgress =
        let dc = DumpContainer()
        // A DumpContainer only appears in the output once it has been dumped.
        dc.Dump(title) |> ignore
        fun (v:int) -> dc.Content <- rateMe timer.ElapsedMilliseconds i.Value v
    let students = fStudents students
    let items =
        photos
        |> Seq.filter (fun photo ->
            // Refresh the progress display every 500 photos.
            if i.Value % 500 = 0 then
                Util.Progress <- i.Value * 100 / maxI
                dumpProgress i.Value
            i.Value <- i.Value + 1
            fPredicate students photo)
        |> Array.ofSeq
    // Final update so the display reflects the completed run.
    dumpProgress i.Value
    title, timer.ElapsedMilliseconds, items.Length
/// Runs the comparison with the students held in a Set, keeping each photo
/// that is not a substring of any student entry.
let hashContains () =
    let notInAnyStudent (studs: Set<string>) (photo: string) =
        studs |> Set.forall (fun student -> not (student.Contains photo))
    genericComparer "hashC" Set.ofArray notInAnyStudent
/// Runs the comparison with the students held in a Set, keeping each photo
/// that is not an exact member of that set.
let hash2 () =
    let isAbsent studs photo = not (Set.contains photo studs)
    genericComparer "hash2" Set.ofArray isAbsent
/// Runs the comparison with the students held in a Map keyed by the student
/// entry itself, keeping each photo that is not a key of that map.
let dic () =
    genericComparer "dic"
        (fun students -> students |> Seq.map (fun student -> student, student) |> Map.ofSeq)
        (fun studs photo -> studs |> Map.containsKey photo |> not)
// Run every comparison strategy in parallel and wait for all of them.
// Each result is (title, elapsed milliseconds, number of photos kept).
// NOTE(review): the results are not displayed here - pipe to Dump if they
// should be shown.
[ hashContains; hash2; dic ]
|> Seq.map (fun f -> async { return f () })
|> Async.Parallel
|> Async.RunSynchronously
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment