Skip to content

Instantly share code, notes, and snippets.

@taidalog
Last active December 31, 2022 10:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taidalog/2967784a2d92065090ff2d0e0b4b7a9d to your computer and use it in GitHub Desktop.
Save taidalog/2967784a2d92065090ff2d0e0b4b7a9d to your computer and use it in GitHub Desktop.
F# script to calculate Jaro Similarity.
namespace Taidalog
[<RequireQualifiedAccess>]
module Seq =
let countWith projection source =
source
|> Seq.filter projection
|> Seq.length
let trySkip count source =
if count > (source |> Seq.length) then
None
else
Seq.skip count source |> Some
[<RequireQualifiedAccess>]
module String =
let tryMid start len str =
str
|> Seq.trySkip start
|> Option.map (Seq.truncate len)
|> Option.map (Seq.map string)
|> Option.map (String.concat "")
[<RequireQualifiedAccess>]
module Tuple =
let map mapping (x, y) =
mapping x, mapping y
module StringMetric =
open System
let jaroSimilarity s1 s2 =
let midStart start range = Math.Max(0, start - range)
let midLen start range = Math.Min(range * 2 + 1, start + range + 1)
let getNearMatch range s1 s2 =
s1
|> Seq.mapi
(fun i c->
c, (s2 |> String.tryMid (midStart i range) (midLen i range)))
|> Seq.map (fun (c, s) -> c, Option.defaultValue "" s)
|> Seq.filter
(fun (c, s) -> s |> Seq.contains c)
|> Seq.map (fun (c, _) -> c)
let matchIndexes s1 s2 range =
s1
|> Seq.mapi
(fun i1 c1->
i1, c1, String.tryMid (midStart i1 range) (midLen i1 range) s2)
|> Seq.map (fun (i1, c1, s) -> i1, c1, Option.defaultValue "" s)
|> Seq.map
(fun (i1, c1, sub) ->
sub
|> Seq.indexed
|> Seq.filter (fun (i2, subc) -> subc = c1)
|> Seq.map (fun (i2, _) -> i2 + midStart i1 range)
|> Seq.toList)
|> Seq.toList
let rec updateBoolList indexes source =
let indexes' = indexes |> List.filter (fun i -> i < List.length source)
match indexes' with
| [] -> source
| _ ->
let h = indexes' |> List.head
if source |> List.item h = false then
source |> List.updateAt h true
else
let t = indexes' |> List.tail
updateBoolList t source
let w1 = 1. / 3.
let w2 = 1. / 3.
let wt = 1. / 3.
let d = s1 |> String.length |> double
let r = s2 |> String.length |> double
let range = Math.Max(d, r) |> fun x -> x / 2. - 1. |> int
let indexes = matchIndexes s1 s2 range
let initBools = List.init (int d) (fun _ -> false)
let updatedBools =
(initBools, indexes)
||> List.fold (fun acc x -> updateBoolList x acc)
let sub1 = getNearMatch range s1 s2
let sub2 =
(s2 |> Seq.toList, updatedBools |> List.toSeq)
||> Seq.zip
|> Seq.filter (fun (c, b) -> b)
|> Seq.map (fun (c, _) -> c)
let c = sub1 |> Seq.length |> double
let t =
Seq.zip sub1 sub2
|> Seq.countWith (fun (x, y) -> x <> y)
|> double
|> fun x -> x / 2.
if s1 = "" || s2 = "" then
None
else if c = 0. then
Some 0.
else
Some (w1 * c / d + w2 * c / r + wt * (c - t) / c)
let jarotaidalogSimilarity s1 s2 =
let w1 =
(s1, s2)
|> Tuple.map (String.length >> double)
||> fun x y -> 1. - ((x - y) / x |> abs)
let subStr =
s1
|> Seq.truncate (String.length s2)
let w2 =
subStr
|> Seq.filter (fun c -> s2 |> Seq.contains c)
|> Seq.length
|> double
|> fun x -> x / (s2 |> String.length |> double)
(w1, w2, s2) |||> printfn "w1: %f, w2: %f, s2: %s"
if s1 = "" || s2 = "" then
None
else
jaroSimilarity s1 s2
|> Option.map (fun s -> s * List.average [w1; w2])
module Main =
open StringMetric
let d0 option = Option.defaultValue 0. option
let score option =
option
|> Option.map (fun s -> s * 5. + 0.5 |> int)
let main0 () =
let s1 = "FAREMVIEL"
let s2 = "FARMVILLE"
jaroSimilarity s1 s2 |> printfn "%A"
let ss = [
"うろ覚え", "うる覚え"
"あめんぼあかいなあいうえお", "あめんぼ赤いなあいうえお"
"たいだ", "まじめ"
]
ss
|> List.map (fun (s1, s2) -> s1, s2, jaroSimilarity s1 s2)
|> List.iter (fun (s1, s2, j) -> printfn "%s, %s: %A" s1 s2 j)
|> ignore
let main1 () =
let s1 =
"あめんぼあかいなあいうえお"
let ss = [
"あめんぼあかいなあいうえお"
"あめんぼ赤いなあいうえお"
"あめんぼ紅いなあいうえお"
"あめんぼあかいなかきくけこ"
"あんめぼあかいなあいうえお"
"なあいうえお"
"あああああああああああああ"
"子丑寅卯辰巳午未申酉戌亥猫"
"あ"
"ああああめんぼ"
]
ss
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s)
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s)
|> ignore
let main2 () =
let s1 =
"あさきゆめみしゑひもせす"
let ss = [
"あさきゆめみしゑひもせす"
"あさゆきめみしゑひもせす"
"あさきゆめみしゑいもせす"
"あさき夢みしゑひもせす"
"あさきゆめみしあいうえお"
"あ"
"ああああああああああああ"
"しゑひもせす"
"子丑寅卯辰巳午未申酉戌亥"
"あああさきゆ"
]
ss
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s)
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s)
|> ignore
let main3 () =
let s1 =
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。"
let ss = [
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。"
"あのイーハトーヴォの透きとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。"
"あのイーハトーヴォのすきとおったかぜ、なつでもそこにつめたさをもつあおいそら、うつくしいもりでかざられたもりーおし、こうがいのぎらぎらひかるくさのなみ。"
"あのイーハトーヴォのすきおとった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。"
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、"
"うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。"
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、あああああああああああああああああああああああああああああああ"
"あああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああ"
"あ"
"The quick brown fox jumps over the lazy dog"
""
]
ss
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s)
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s)
|> ignore
let main4 () =
let s1 =
"いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす"
let ss = [
"いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす"
"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん"
]
ss
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s)
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s)
|> ignore
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment