Skip to content

Instantly share code, notes, and snippets.

@palladin
Last active September 25, 2021 17:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save palladin/bc278fc010e4d244ef7a to your computer and use it in GitHub Desktop.
Save palladin/bc278fc010e4d244ef7a to your computer and use it in GitHub Desktop.
Experiment
// F# Interactive directive: turn on timing output for the evaluations that follow.
#time "on"
/// Reads lines from `sentencesFile`, skips the first `nSkip` of them, takes the
/// next `nSentences`, and converts each one to hashes with `sentence2hashes`.
///
/// Returns a stream with one `sentence2hashes` result per selected line.
let inline readAllHashes nSkip nSentences =
    // Everything up to and including the first space is dropped before hashing.
    // When a line has no space, IndexOf yields -1, so the whole line is kept.
    let hashesOfLine (line: string) =
        let sentenceStart = line.IndexOf(' ') + 1
        sentence2hashes (line.Substring(sentenceStart))
    File.ReadLines(sentencesFile)
    |> Stream.ofSeq
    |> Stream.skip nSkip
    |> Stream.take nSentences
    |> Stream.map hashesOfLine
/// Builds a stream of (shingle, sentence id) pairs for the selected sentences.
///
/// Each element produced by `readAllHashes nSkip nSentences` is paired with the
/// index `Stream.mapi` assigns to it; that index serves as the sentence id.
/// The hashes are then expanded by `shingling nWordInShingle`, and every
/// resulting shingle is emitted as `spair shingle id`.
let inline shinglingBeginingAndEnd nSkip nSentences =
    readAllHashes nSkip nSentences
    |> Stream.mapi (fun id hashes -> spair id hashes)
    |> Stream.collect (fun pair ->
        let id = spair_fst pair
        let hashes = spair_snd pair
        // The shingles must be piped into Stream.ofArray. Without the `|>` the
        // shingling result is discarded and the function value Stream.ofArray
        // itself is fed to Stream.map, which does not type-check.
        // NOTE(review): assumes `shingling` returns an array, as the use of
        // Stream.ofArray implies - confirm against its definition.
        shingling nWordInShingle hashes
        |> Stream.ofArray
        |> Stream.map (fun shingle -> spair shingle id))
/// Groups sentence ids by shingle and returns, for every group that has more
/// than one element, the array of ids in that group.
///
/// The (shingle, id) pairs come from `shinglingBeginingAndEnd nSkip nSentences`.
/// Grouping, filtering and projection run on a ParStream; the result is an
/// array of id arrays.
let inline allGroups nSkip nSentences =
    // Expose the sequential stream as a seq so it can feed the parallel stream.
    let shinglePairs =
        shinglingBeginingAndEnd nSkip nSentences
        |> Stream.toSeq
    // Bucket the pairs by their first component (the shingle).
    let groupedByShingle =
        shinglePairs
        |> ParStream.ofSeq
        |> ParStream.groupBy (fun entry -> spair_fst entry)
    // Drop single-element buckets, then reduce each remaining bucket to the
    // second component (the id) of its pairs.
    groupedByShingle
    |> ParStream.filter (fun (_, members) -> Seq.length members > 1)
    |> ParStream.map (fun (_, members) ->
        members
        |> Stream.ofSeq
        |> Stream.map (fun entry -> spair_snd entry)
        |> Stream.toArray)
    |> ParStream.toArray
// Run the full pipeline. `nSkip` and `nSentences` are not defined in this snippet
// and must already be bound in the session.
allGroups nSkip nSentences
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment