Skip to content

Instantly share code, notes, and snippets.

@pchiusano
Created November 22, 2022 23:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pchiusano/fb8c33b09d1773cbdbc42538e9a887ca to your computer and use it in GitHub Desktop.
Save pchiusano/fb8c33b09d1773cbdbc42538e9a887ca to your computer and use it in GitHub Desktop.
Suffix tree prototyping
-- id is the doc id, a is the text type, v is the value type
unique type Trie id txt
= Remainder txt (Trie.Pos id)
| Branch Nat (Dataset Value (Trie.Pos id)) (Map txt (Value (Trie id txt)))
--Remainder id txt Nat
-- there can be multiple docs at this suffix
-- Branch size
--| Branch Nat (Dataset Value id) (Map txt (Value (Trie id txt)))
unique type Trie.Pos id = Pos id Nat
Trie.Pos.position : Pos id -> Nat
Trie.Pos.position = cases Pos _ i -> i
Trie.Pos.id : Pos id -> id
Trie.Pos.id = cases Pos id _ -> id
Trie.size : Trie id txt -> Nat
Trie.size = cases
Remainder _ _ -> 1
Branch n _ _ -> n
Trie.debug.show : Trie Text Text ->{Remote} Text
Trie.debug.show =
use Text ++
go indent = cases
Remainder txt (Pos doc pos) -> indent ++ "Remainder " ++ doc ++ " \"" ++ txt ++ "\" " ++ Nat.toText pos
Branch n ds subs ->
subsL = Map.toList subs
showPos = cases Pos id position -> "pos " ++ id ++ " " ++ (Nat.toText position)
showDs ds = match Storage.ram.value '(Dataset.toList ds) with
[] -> "[]"
ds -> "[" ++ (List.map showPos ds |> Text.join ", ") ++ "]"
showSub = cases (subChar, sub) ->
indent ++ " \"" ++ subChar ++ "\"\n" ++ go (indent ++ " ") (Value.get sub)
showSubs = Text.join "\n" (List.map showSub subsL)
indent ++ "Branch (size=" ++ Nat.toText n ++ ") (here=" ++ (showDs ds) ++ ")\n"
++ showSubs
-- Branch 14 7
-- a
-- Branch
-- b
go ""
-- Todo: merge could be parallel, much like fromSuffixes, with the same thresholding
Dataset.thresholdSave : Nat -> Dataset d a ->{Storage d} Dataset d a
Dataset.thresholdSave maxUnsaved d =
if Dataset.unsavedDepth d >= maxUnsaved then Dataset.save d
else d
Trie.merge : Rope.Chunk txt -> (forall a . a ->{Remote} Value a) -> Trie id txt -> Trie id txt ->{Remote} Trie id txt
Trie.merge T save = cases
Remainder txt pos, t -> Trie.insert T save txt pos t
t, Remainder txt pos -> Trie.insert T save txt pos t
Branch sz1 here1 subs1, Branch sz2 here2 subs2 ->
sz = sz1 + sz2
here = Storage.saveWith save '(Dataset.thresholdSave 5 (Dataset.union here1 here2))
combine vt1 vt2 = save (Trie.merge T save (Value.get vt1) (Value.get vt2))
Branch sz here (Map.unionWith combine subs1 subs2)
Trie.insert : Rope.Chunk txt -> (forall a . a ->{Remote} Value a) -> txt -> Pos id -> Trie id txt ->{Remote} Trie id txt
Trie.insert T save txt pos = cases
Remainder txt2 pos2 -> match (Chunk.uncons T txt, Chunk.uncons T txt2) with
(Some (hd1,tl1), Some (hd2,tl2)) ->
rem1 = Remainder tl1 pos
rem2 = Remainder tl2 pos2
if hd1 === hd2 then
Branch 2 Dataset.empty (Map.fromList [(hd1, save (Trie.merge T save rem1 rem2 ))])
else
Branch 2 Dataset.empty (Map.fromList [(hd1, save rem1), (hd2, save rem2)])
(None, Some (hd2, tl2)) ->
Branch 2 (saveWith save '(Dataset.fromList [pos])) (Map.fromList [(hd2, save (Remainder tl2 pos2))])
(Some (hd, t1), None) ->
Branch 2 (saveWith save '(Dataset.fromList [pos2])) (Map.fromList [(hd, save (Remainder t1 pos))])
(None, None) -> Branch 2 (saveWith save '(Dataset.fromList [pos, pos2])) Map.empty
Branch sz here subs -> match Chunk.uncons T txt with
None -> Branch (sz + 1) (saveWith save '(Dataset.insert pos here)) subs
Some (hd, tl) ->
f = cases
None -> Some (save (Remainder tl pos))
Some t -> Some (save (Trie.insert T save tl pos (Value.get t)))
Branch (sz + 1) here (Map.alter f hd subs)
-- Remainder
-- doc1 - "zabra."
-- doc2 - "zabracadabra."
-- "." is a unique suffix in doc1 and in doc2, so it will become a Remainder
--
> Remote.pure.run do
t1 = Trie.fromSuffixes Chunk.Text Value.pure 100 "doc1" "ab"
t2 = Trie.fromSuffixes Chunk.Text Value.pure 100 "doc2" "b"
msg = Trie.merge Chunk.Text Value.pure t1 t2 |> Trie.debug.show
trace ("\n" ++ msg) ()
-- todo: ensure that Trie.fromSuffixes never produces a Remainder with an empty remainder
up.List.filterNot pred = List.filter (x -> not (pred x))
Trie.empty = Branch 0 Dataset.empty Map.empty
Trie.fromSuffixes : Rope.Chunk txt
-> (forall a . a ->{Remote} Value a)
-> Nat
-> id
-> txt
->{Remote} Trie id txt
Trie.fromSuffixes T save parThreshold id txt =
txtSize = Rope.Chunk.size T txt
go : List.Nonempty (txt,Nat) ->{Remote} Value (Trie id txt)
go = cases
Nonempty (txt,i) [] -> save (Remainder txt (Pos id i))
txts ->
subs = List.groupBy (at1 >> Chunk.take T 1) (toList txts) |> Map.toList
rec = cases (ch, cs0) ->
cs : Nonempty (txt, Nat)
cs = Nonempty.map (cases (suffix, i) -> (Chunk.drop T 1 suffix, i)) cs0
if Nonempty.size cs > parThreshold then
(ch, Left (Remote.forkAt here! '(go cs)))
else
(ch, Right (go cs))
ts = List.map rec subs |> List.map cases
(ch, Left t) -> (ch, Remote.await t)
(ch, Right t) -> (ch, t)
save (Branch (List.Nonempty.size txts) Dataset.empty (Map.fromList ts))
nonempty = List.filter (not << Rope.Chunk.isEmpty T)
match Rope.Chunk.suffixes T txt |> nonempty |> List.indexed with
[] -> Trie.empty
h +: t -> Value.get (go (Nonempty h t))
Trie.subtrie : Rope.Chunk txt -> txt -> Trie id txt ->{Remote} Trie id txt
Trie.subtrie T txt t =
match Chunk.uncons T txt with
None -> t
Some (hd, txt) -> match t with
Branch _ _ children ->
match Map.lookup hd children with
None -> Trie.empty
Some t -> Trie.subtrie T txt (Value.get t)
_ -> Trie.empty
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment