Last active
December 31, 2022 10:15
-
-
Save taidalog/2967784a2d92065090ff2d0e0b4b7a9d to your computer and use it in GitHub Desktop.
F# script to calculate Jaro Similarity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Taidalog | |
[<RequireQualifiedAccess>] | |
module Seq = | |
let countWith projection source = | |
source | |
|> Seq.filter projection | |
|> Seq.length | |
let trySkip count source = | |
if count > (source |> Seq.length) then | |
None | |
else | |
Seq.skip count source |> Some | |
[<RequireQualifiedAccess>] | |
module String = | |
let tryMid start len str = | |
str | |
|> Seq.trySkip start | |
|> Option.map (Seq.truncate len) | |
|> Option.map (Seq.map string) | |
|> Option.map (String.concat "") | |
[<RequireQualifiedAccess>] | |
module Tuple = | |
let map mapping (x, y) = | |
mapping x, mapping y | |
module StringMetric = | |
open System | |
let jaroSimilarity s1 s2 = | |
let midStart start range = Math.Max(0, start - range) | |
let midLen start range = Math.Min(range * 2 + 1, start + range + 1) | |
let getNearMatch range s1 s2 = | |
s1 | |
|> Seq.mapi | |
(fun i c-> | |
c, (s2 |> String.tryMid (midStart i range) (midLen i range))) | |
|> Seq.map (fun (c, s) -> c, Option.defaultValue "" s) | |
|> Seq.filter | |
(fun (c, s) -> s |> Seq.contains c) | |
|> Seq.map (fun (c, _) -> c) | |
let matchIndexes s1 s2 range = | |
s1 | |
|> Seq.mapi | |
(fun i1 c1-> | |
i1, c1, String.tryMid (midStart i1 range) (midLen i1 range) s2) | |
|> Seq.map (fun (i1, c1, s) -> i1, c1, Option.defaultValue "" s) | |
|> Seq.map | |
(fun (i1, c1, sub) -> | |
sub | |
|> Seq.indexed | |
|> Seq.filter (fun (i2, subc) -> subc = c1) | |
|> Seq.map (fun (i2, _) -> i2 + midStart i1 range) | |
|> Seq.toList) | |
|> Seq.toList | |
let rec updateBoolList indexes source = | |
let indexes' = indexes |> List.filter (fun i -> i < List.length source) | |
match indexes' with | |
| [] -> source | |
| _ -> | |
let h = indexes' |> List.head | |
if source |> List.item h = false then | |
source |> List.updateAt h true | |
else | |
let t = indexes' |> List.tail | |
updateBoolList t source | |
let w1 = 1. / 3. | |
let w2 = 1. / 3. | |
let wt = 1. / 3. | |
let d = s1 |> String.length |> double | |
let r = s2 |> String.length |> double | |
let range = Math.Max(d, r) |> fun x -> x / 2. - 1. |> int | |
let indexes = matchIndexes s1 s2 range | |
let initBools = List.init (int d) (fun _ -> false) | |
let updatedBools = | |
(initBools, indexes) | |
||> List.fold (fun acc x -> updateBoolList x acc) | |
let sub1 = getNearMatch range s1 s2 | |
let sub2 = | |
(s2 |> Seq.toList, updatedBools |> List.toSeq) | |
||> Seq.zip | |
|> Seq.filter (fun (c, b) -> b) | |
|> Seq.map (fun (c, _) -> c) | |
let c = sub1 |> Seq.length |> double | |
let t = | |
Seq.zip sub1 sub2 | |
|> Seq.countWith (fun (x, y) -> x <> y) | |
|> double | |
|> fun x -> x / 2. | |
if s1 = "" || s2 = "" then | |
None | |
else if c = 0. then | |
Some 0. | |
else | |
Some (w1 * c / d + w2 * c / r + wt * (c - t) / c) | |
let jarotaidalogSimilarity s1 s2 = | |
let w1 = | |
(s1, s2) | |
|> Tuple.map (String.length >> double) | |
||> fun x y -> 1. - ((x - y) / x |> abs) | |
let subStr = | |
s1 | |
|> Seq.truncate (String.length s2) | |
let w2 = | |
subStr | |
|> Seq.filter (fun c -> s2 |> Seq.contains c) | |
|> Seq.length | |
|> double | |
|> fun x -> x / (s2 |> String.length |> double) | |
(w1, w2, s2) |||> printfn "w1: %f, w2: %f, s2: %s" | |
if s1 = "" || s2 = "" then | |
None | |
else | |
jaroSimilarity s1 s2 | |
|> Option.map (fun s -> s * List.average [w1; w2]) | |
module Main = | |
open StringMetric | |
let d0 option = Option.defaultValue 0. option | |
let score option = | |
option | |
|> Option.map (fun s -> s * 5. + 0.5 |> int) | |
let main0 () = | |
let s1 = "FAREMVIEL" | |
let s2 = "FARMVILLE" | |
jaroSimilarity s1 s2 |> printfn "%A" | |
let ss = [ | |
"うろ覚え", "うる覚え" | |
"あめんぼあかいなあいうえお", "あめんぼ赤いなあいうえお" | |
"たいだ", "まじめ" | |
] | |
ss | |
|> List.map (fun (s1, s2) -> s1, s2, jaroSimilarity s1 s2) | |
|> List.iter (fun (s1, s2, j) -> printfn "%s, %s: %A" s1 s2 j) | |
|> ignore | |
let main1 () = | |
let s1 = | |
"あめんぼあかいなあいうえお" | |
let ss = [ | |
"あめんぼあかいなあいうえお" | |
"あめんぼ赤いなあいうえお" | |
"あめんぼ紅いなあいうえお" | |
"あめんぼあかいなかきくけこ" | |
"あんめぼあかいなあいうえお" | |
"なあいうえお" | |
"あああああああああああああ" | |
"子丑寅卯辰巳午未申酉戌亥猫" | |
"あ" | |
"ああああめんぼ" | |
] | |
ss | |
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s) | |
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s) | |
|> ignore | |
let main2 () = | |
let s1 = | |
"あさきゆめみしゑひもせす" | |
let ss = [ | |
"あさきゆめみしゑひもせす" | |
"あさゆきめみしゑひもせす" | |
"あさきゆめみしゑいもせす" | |
"あさき夢みしゑひもせす" | |
"あさきゆめみしあいうえお" | |
"あ" | |
"ああああああああああああ" | |
"しゑひもせす" | |
"子丑寅卯辰巳午未申酉戌亥" | |
"あああさきゆ" | |
] | |
ss | |
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s) | |
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s) | |
|> ignore | |
let main3 () = | |
let s1 = | |
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。" | |
let ss = [ | |
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。" | |
"あのイーハトーヴォの透きとおった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。" | |
"あのイーハトーヴォのすきとおったかぜ、なつでもそこにつめたさをもつあおいそら、うつくしいもりでかざられたもりーおし、こうがいのぎらぎらひかるくさのなみ。" | |
"あのイーハトーヴォのすきおとった風、夏でも底に冷たさをもつ青いそら、うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。" | |
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、" | |
"うつくしい森で飾られたモリーオ市、郊外のぎらぎらひかる草の波。" | |
"あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら、あああああああああああああああああああああああああああああああ" | |
"あああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああ" | |
"あ" | |
"The quick brown fox jumps over the lazy dog" | |
"" | |
] | |
ss | |
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s) | |
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s) | |
|> ignore | |
let main4 () = | |
let s1 = | |
"いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす" | |
let ss = [ | |
"いろはにほへとちりぬるをわかよたれそつねならむうゐのおくやまけふこえてあさきゆめみしゑひもせす" | |
"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん" | |
] | |
ss | |
|> List.map (fun s -> jaroSimilarity s1 s, jarotaidalogSimilarity s1 s, s) | |
|> List.iter (fun (j, t, s) -> printfn "Jaro: %A, Jaro-taida: %A, %s" j t s) | |
|> ignore |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment