Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Experiment
// F# Interactive directive: turn on timing output for the evaluations that follow.
#time "on"
/// Reads lines from `sentencesFile`, skips the first `nSkip` of them, keeps the
/// next `nSentences`, and maps each kept line to the result of `sentence2hashes`.
///
/// Only the text after the first space of a line is hashed. When a line has no
/// space, `IndexOf` yields -1 and the whole line is passed through unchanged.
let inline readAllHashes nSkip nSentences =
    // Strip the prefix up to and including the first space, then hash the rest.
    let hashesOfLine (rawLine: string) =
        let firstSpace = rawLine.IndexOf(' ')
        rawLine.Substring(firstSpace + 1) |> sentence2hashes

    let selectedLines =
        File.ReadLines sentencesFile
        |> Stream.ofSeq
        |> Stream.skip nSkip
        |> Stream.take nSentences

    Stream.map hashesOfLine selectedLines
/// Produces a stream of `spair shingle id` pairs for the sentences selected by
/// `readAllHashes nSkip nSentences`.
///
/// `id` is the position of the sentence within the stream returned by
/// `readAllHashes` (as assigned by `Stream.mapi`), and each `shingle` comes from
/// `shingling nWordInShingle hashes` for that sentence.
let inline shinglingBeginingAndEnd nSkip nSentences =
    readAllHashes nSkip nSentences
    // Tag every sentence's hashes with its index in the stream.
    |> Stream.mapi (fun id hashes -> spair id hashes)
    |> Stream.collect (fun pair ->
        let id = spair_fst pair
        let hashes = spair_snd pair
        // The shingles must be piped into Stream.ofArray; without the leading
        // `|>` the shingling result is discarded and the expression does not
        // type-check.
        // NOTE(review): assumes `shingling` returns an array — confirm.
        shingling nWordInShingle hashes
        |> Stream.ofArray
        |> Stream.map (fun shingle -> spair shingle id))
/// Groups the (shingle, sentence id) pairs produced by `shinglingBeginingAndEnd`
/// by their first component, discards groups holding a single pair, and returns
/// an array with one entry per remaining group: the array of second components
/// of the pairs in that group.
let inline allGroups nSkip nSentences =
    // Materialise the sequential stream as a seq so it can feed a ParStream.
    let shinglePairs =
        shinglingBeginingAndEnd nSkip nSentences
        |> Stream.toSeq

    // Bucket the pairs by their first component (the shingle).
    let groupedByShingle =
        ParStream.ofSeq shinglePairs
        |> ParStream.groupBy (fun p -> spair_fst p)

    groupedByShingle
    // Keep only groups that contain more than one pair.
    |> ParStream.filter (fun (_, members) -> Seq.length members > 1)
    // Reduce every surviving group to the second components of its pairs.
    |> ParStream.map (fun (_, members) ->
        members
        |> Stream.ofSeq
        |> Stream.map (fun p -> spair_snd p)
        |> Stream.toArray)
    |> ParStream.toArray
// Evaluate the grouping pipeline at script top level.
// NOTE(review): `nSkip` and `nSentences` are not bound in the visible part of this script — presumably defined elsewhere; confirm.
allGroups nSkip nSentences
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment