Skip to content

Instantly share code, notes, and snippets.

@MattMS
Created December 12, 2022 07:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MattMS/7288108299a33a5bb3457d80acd81008 to your computer and use it in GitHub Desktop.
Save MattMS/7288108299a33a5bb3457d80acd81008 to your computer and use it in GitHub Desktop.
Encoding character pairs
/// Pool of candidate replacement characters: every `char` value from
/// `Char.MinValue` through `Char.MaxValue` for which `Char.IsControl`
/// returns false.
let allChars =
    seq { System.Char.MinValue .. System.Char.MaxValue }
    |> Seq.filter (fun candidate -> not (System.Char.IsControl candidate))
    |> Set.ofSeq
/// Returns the most frequent pair of adjacent characters in `s` as a
/// two-character string. Pairs are taken from a sliding window of width 2,
/// so overlapping occurrences are each counted.
///
/// When several pairs share the highest count, the one that appears last in
/// the output of `Seq.countBy` wins (the same pair the previous
/// stable-sort-then-last implementation selected).
///
/// Raises `ArgumentNullException` when `s` is null and `ArgumentException`
/// when `s` has fewer than two characters, because no pair exists.
let mostCommonPair (s: string) =
    if isNull s then
        nullArg "s"
    if s.Length < 2 then
        invalidArg "s" "Input must contain at least two characters to form a pair."
    let pairChars, _ =
        s
        |> Seq.windowed 2
        |> Seq.countBy id
        // Single pass instead of a full sort; `>=` keeps the later entry on ties.
        |> Seq.reduce (fun best next -> if snd next >= snd best then next else best)
    // Build the string from the char[] window explicitly.
    System.String(pairChars)
/// Picks a replacement token: the smallest character in `unused` that does
/// not occur anywhere in `s`, returned as a one-character string.
///
/// `target` is accepted but not consulted by this function.
/// Fails (via `Set.minElement`) when every character of `unused` already
/// occurs in `s`.
let getUnused (unused: Set<char>) (target: string) (s: string) =
    let candidates = Set.difference unused (set s)
    string (Set.minElement candidates)
/// Performs one compression step on the state `(s, swaps)`.
///
/// Finds the most common adjacent character pair in `s`, chooses a token
/// from `allChars` that does not occur in `s`, and replaces every occurrence
/// of the pair with that token. Returns the rewritten string together with
/// `swaps` extended at the front by `(token, pair)`, so the most recent
/// substitution is at the head of the list.
let shrink (s, swaps) =
    let frequentPair = mostCommonPair s
    let token = getUnused allChars frequentPair s
    let compressed = s.Replace(frequentPair, token)
    compressed, (token, frequentPair) :: swaps
// - https://en.wikipedia.org/wiki/Byte_pair_encoding
// - https://en.wikipedia.org/wiki/Re-Pair
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment